from pathlib import Path

from kontextual_engine import (
    Actor,
    ActorType,
    AssetIngestionService,
    AssetQueryRequest,
    AssetRegistryService,
    AssetRepresentation,
    AssetRetrievalService,
    Classification,
    ContextEntity,
    ContextEntityQueryRequest,
    ContextEntityType,
    InMemoryAssetRegistryRepository,
    LifecycleState,
    MetadataRecord,
    OperationContext,
    PolicyDecision,
    RelationshipQueryRequest,
    RepresentationKind,
    RetrievalFeedbackLabel,
    RetrievalFeedbackRequest,
    Sensitivity,
    SQLiteAssetRegistryRepository,
    SourceReference,
)


def test_asset_retrieval_returns_stable_paginated_envelope() -> None:
    """Page through title-sorted documents one result at a time.

    Three assets are registered, two of them typed ``document``. Querying
    documents with ``limit=1`` must yield Alpha first, then Bravo, and the
    serialized second page must carry Bravo's source ref, representation,
    metadata record and the correlation id.
    """
    store = InMemoryAssetRegistryRepository()
    registry = AssetRegistryService(store)
    retrieval = AssetRetrievalService(store)
    ctx = operation_context()

    bravo_source = SourceReference(source_system="repo", path="docs/b.md", checksum="sha256:b")
    bravo_representation = AssetRepresentation.from_content(
        "asset-bravo",
        RepresentationKind.NORMALIZED,
        "text/plain",
        "bravo content",
        storage_ref="object://bravo-normalized",
        source_ref_id=bravo_source.id,
    )

    # Registered out of alphabetical order so the title sort is actually exercised.
    registry.create_asset(
        "Charlie",
        Classification(asset_type="note", sensitivity=Sensitivity.INTERNAL),
        ctx,
        asset_id="asset-charlie",
    )
    registry.create_asset(
        "Bravo",
        Classification(
            asset_type="document",
            sensitivity=Sensitivity.PUBLIC,
            owner="Docs",
            topics=("retrieval",),
        ),
        ctx,
        asset_id="asset-bravo",
        source_refs=[bravo_source],
        representations=[bravo_representation],
        metadata_records=[MetadataRecord("status", "approved", confirmed=True)],
    )
    registry.create_asset(
        "Alpha",
        Classification(asset_type="document", sensitivity=Sensitivity.PUBLIC, owner="Docs"),
        ctx,
        asset_id="asset-alpha",
    )

    page_one_request = AssetQueryRequest(asset_type="document", sort_by="title", limit=1)
    first_page = retrieval.query_assets(page_one_request, ctx)
    page_two_request = AssetQueryRequest(asset_type="document", sort_by="title", limit=1, offset=1)
    second_page = retrieval.query_assets(page_two_request, ctx)

    assert first_page.success is True
    assert first_page.total == 2
    assert first_page.result_count == 1
    assert first_page.next_offset == 1
    assert first_page.items[0].asset.id == "asset-alpha"

    assert second_page.next_offset is None
    assert second_page.items[0].asset.id == "asset-bravo"
    assert second_page.to_dict()["results"][0]["source_refs"][0]["path"] == "docs/b.md"
    assert (
        second_page.to_dict()["results"][0]["representations"][0]["storage_ref"]
        == "object://bravo-normalized"
    )
    assert second_page.to_dict()["results"][0]["metadata_records"][0]["key"] == "status"
    assert second_page.to_dict()["correlation_id"] == "corr-retrieval"


def test_asset_retrieval_filters_source_metadata_lifecycle_and_representation_kind() -> None:
    """Combine lifecycle, source, metadata and representation filters in one query.

    Only the active, approved guide with confirmed metadata should come back;
    the retired draft with unconfirmed metadata must be filtered out.
    """
    store = InMemoryAssetRegistryRepository()
    registry = AssetRegistryService(store)
    retrieval = AssetRetrievalService(store)
    ctx = operation_context()

    guide_source = SourceReference(
        source_system="repo",
        path="docs/guide.md",
        checksum="sha256:guide",
    )
    guide_classification = Classification(
        asset_type="guide",
        sensitivity=Sensitivity.INTERNAL,
        owner="Docs",
        topics=("retrieval",),
        review_state="approved",
    )
    guide_representation = AssetRepresentation.from_content(
        "asset-guide",
        RepresentationKind.NORMALIZED,
        "text/plain",
        "guide normalized",
    )
    registry.create_asset(
        "Guide",
        guide_classification,
        ctx,
        asset_id="asset-guide",
        source_refs=[guide_source],
        representations=[guide_representation],
        metadata_records=[MetadataRecord("channel", "internal", confirmed=True)],
    )

    draft = registry.create_asset(
        "Draft",
        Classification(asset_type="guide", sensitivity=Sensitivity.INTERNAL, owner="Docs"),
        ctx,
        asset_id="asset-draft",
        metadata_records=[MetadataRecord("channel", "internal", confirmed=False)],
    )
    registry.transition_lifecycle(draft.asset.id, LifecycleState.RETIRED, ctx)

    request = AssetQueryRequest(
        asset_type="guide",
        lifecycle=LifecycleState.ACTIVE,
        owner="Docs",
        topic="retrieval",
        review_state="approved",
        metadata_filters={"channel": "internal"},
        confirmed_metadata_only=True,
        source_system="repo",
        source_path="docs/guide.md",
        representation_kind=RepresentationKind.NORMALIZED,
    )
    result = retrieval.query_assets(request, ctx)

    assert _asset_ids(result) == ["asset-guide"]
    assert result.items[0].representations[0].kind == RepresentationKind.NORMALIZED
    assert result.items[0].metadata_records[0].confirmed is True


def test_asset_retrieval_invalid_query_returns_structured_diagnostics() -> None:
    """An asset query with six invalid fields yields six diagnostic codes, not an exception."""
    retrieval = AssetRetrievalService(InMemoryAssetRegistryRepository())
    bad_request = AssetQueryRequest(
        lifecycle="missing",
        representation_kind="summary",
        sort_by="rank",
        sort_order="sideways",
        limit=0,
        offset=-1,
    )

    result = retrieval.query_assets(bad_request, operation_context())

    assert result.success is False
    assert result.total == 0
    assert result.items == ()
    assert set(_diagnostic_codes(result)) == {
        "retrieval.lifecycle_invalid",
        "retrieval.representation_kind_invalid",
        "retrieval.sort_invalid",
        "retrieval.sort_order_invalid",
        "retrieval.limit_invalid",
        "retrieval.offset_invalid",
    }

    payload = result.to_dict()
    assert payload["success"] is False
    assert payload["diagnostics"][0]["severity"] == "error"


def test_asset_retrieval_lexical_search_over_normalized_content(tmp_path: Path) -> None:
    """Ingest two text files, refresh the index, and search them by substring.

    ``alpha`` occurs twice in the first file and nowhere in the second;
    ``gamma`` occurs in neither and must produce a zero-result envelope.
    """
    alpha_file = tmp_path / "alpha.txt"
    beta_file = tmp_path / "beta.txt"
    alpha_file.write_text("alpha retrieval signal\nalpha again\n", encoding="utf-8")
    beta_file.write_text("beta only\n", encoding="utf-8")

    store = InMemoryAssetRegistryRepository()
    ingestion = AssetIngestionService(store)
    retrieval = AssetRetrievalService(store)
    ctx = operation_context()
    ingestion.ingest_file(alpha_file, ctx, asset_id="asset-alpha")
    ingestion.ingest_file(beta_file, ctx, asset_id="asset-beta")

    refresh = retrieval.refresh_index()
    hits = retrieval.query_assets(AssetQueryRequest(text="alpha"), ctx)
    misses = retrieval.query_assets(AssetQueryRequest(text="gamma"), ctx)

    assert refresh.indexed_assets == 2
    assert refresh.indexed_representations == 2

    assert _asset_ids(hits) == ["asset-alpha"]
    assert hits.items[0].relevance["strategy"] == "lexical_substring"
    assert hits.items[0].relevance["match_count"] == 2
    assert hits.items[0].relevance["representation_ids"]
    assert hits.metadata["zero_result"] is False
    assert hits.metadata["lexical_index"] == {
        "indexed_assets": 2,
        "indexed_representations": 2,
    }

    assert misses.total == 0
    assert misses.metadata["zero_result"] is True


def test_asset_retrieval_returns_permission_filtered_source_grounded_snippets() -> None:
    """Snippets come only from assets the policy allows and keep their provenance.

    Both the public and the confidential asset contain ``alpha``; with the
    deny-confidential policy only the public one is returned, and the audit
    event records that one result was permission-filtered.
    """
    store = InMemoryAssetRegistryRepository()
    registry = AssetRegistryService(store)
    retrieval = AssetRetrievalService(store, policy_gateway=DenyConfidentialRetrievalPolicy())
    ctx = operation_context()

    public_source = SourceReference(
        source_system="repo",
        path="docs/public.md",
        checksum="sha256:public",
    )
    confidential_source = SourceReference(
        source_system="repo",
        path="docs/confidential.md",
        checksum="sha256:confidential",
    )

    registry.create_asset(
        "Public Snippet",
        Classification(asset_type="document", sensitivity=Sensitivity.PUBLIC),
        ctx,
        asset_id="asset-public-snippet",
        source_refs=[public_source],
        representations=[
            AssetRepresentation.from_content(
                "asset-public-snippet",
                RepresentationKind.NORMALIZED,
                "text/markdown",
                "normalized public",
                source_ref_id=public_source.id,
                metadata={
                    "search_text": "Public alpha signal that should be citeable from the normalized document.",
                    "extractor": "markitect-tool",
                    "markitect_selector": "section:intro",
                },
            )
        ],
    )
    registry.create_asset(
        "Confidential Snippet",
        Classification(asset_type="document", sensitivity=Sensitivity.CONFIDENTIAL),
        ctx,
        asset_id="asset-confidential-snippet",
        source_refs=[confidential_source],
        representations=[
            AssetRepresentation.from_content(
                "asset-confidential-snippet",
                RepresentationKind.NORMALIZED,
                "text/markdown",
                "normalized confidential",
                source_ref_id=confidential_source.id,
                metadata={
                    "search_text": "Confidential alpha signal must not leak as a snippet.",
                    "extractor": "markitect-tool",
                    "markitect_selector": "section:private",
                },
            )
        ],
    )
    retrieval.refresh_index()

    request = AssetQueryRequest(
        text="alpha",
        include_snippets=True,
        max_snippets=1,
        snippet_radius=12,
    )
    result = retrieval.query_assets(request, ctx)

    assert _asset_ids(result) == ["asset-public-snippet"]
    assert result.items[0].relevance["snippet_count"] == 1

    snippet = result.items[0].snippets[0]
    assert snippet.asset_id == "asset-public-snippet"
    assert snippet.source_ref_id == public_source.id
    assert snippet.match_text == "alpha"
    assert "alpha" in snippet.text
    assert snippet.provenance["extractor"] == "markitect-tool"
    assert snippet.provenance["markitect_selector"] == "section:intro"
    assert result.to_dict()["results"][0]["snippets"][0]["source_ref_id"] == public_source.id

    retrieval_audit = _latest_asset_query_audit(store)
    assert retrieval_audit.details["permission_filtered_count"] == 1


def test_asset_retrieval_filters_are_backend_portable_with_sqlite(tmp_path: Path) -> None:
    """Run a combined text, tag, collection, source and metadata query on SQLite."""
    store = SQLiteAssetRegistryRepository(tmp_path / "registry.sqlite")
    registry = AssetRegistryService(store)
    retrieval = AssetRetrievalService(store)
    ctx = operation_context()

    report_source = SourceReference(
        source_system="local_file",
        path="docs/report.txt",
        checksum="sha256:report",
    )
    registry.create_asset(
        "Report",
        Classification(
            asset_type="report",
            sensitivity=Sensitivity.INTERNAL,
            owner="Analytics",
            topics=("retrieval", "quarterly"),
            review_state="approved",
            metadata={"collection": "reports"},
        ),
        ctx,
        asset_id="asset-report",
        source_refs=[report_source],
        representations=[
            AssetRepresentation.from_content(
                "asset-report",
                RepresentationKind.NORMALIZED,
                "text/plain",
                "quarterly retrieval report",
                metadata={"search_text": "quarterly retrieval report"},
            )
        ],
        metadata_records=[
            MetadataRecord("collection", "reports", confirmed=True),
            MetadataRecord("tags", ["finance", "retrieval"], confirmed=True),
        ],
    )
    # Same type and owner, but a different collection, no source and no content.
    registry.create_asset(
        "Other",
        Classification(asset_type="report", sensitivity=Sensitivity.INTERNAL, owner="Analytics"),
        ctx,
        asset_id="asset-other",
        metadata_records=[MetadataRecord("collection", "misc", confirmed=True)],
    )
    retrieval.refresh_index()

    request = AssetQueryRequest(
        text="retrieval",
        asset_type="report",
        sensitivity=Sensitivity.INTERNAL,
        owner="Analytics",
        tags=("finance", "retrieval"),
        collection="reports",
        source_system="local_file",
        source_path="docs/report.txt",
        metadata_filters={"collection": "reports"},
        confirmed_metadata_only=True,
    )
    result = retrieval.query_assets(request, ctx)

    assert _asset_ids(result) == ["asset-report"]
    assert result.items[0].relevance["match_count"] == 1
    assert result.items[0].metadata_records[0].confirmed is True


def test_asset_retrieval_filters_by_context_entity_workflow_run_and_related_asset() -> None:
    """Filter assets through their links to context entities and to other assets.

    ``asset-policy`` is linked to a project, ``asset-implementation`` to a
    workflow run and to the policy asset; ``asset-unrelated`` has no links and
    must not appear in any of the three results.
    """
    store = InMemoryAssetRegistryRepository()
    registry = AssetRegistryService(store)
    retrieval = AssetRetrievalService(store)
    ctx = operation_context()

    shared_classification = Classification(asset_type="document", sensitivity=Sensitivity.INTERNAL)
    policy = registry.create_asset("Policy", shared_classification, ctx, asset_id="asset-policy")
    implementation = registry.create_asset(
        "Implementation",
        shared_classification,
        ctx,
        asset_id="asset-implementation",
    )
    registry.create_asset("Unrelated", shared_classification, ctx, asset_id="asset-unrelated")

    project = ContextEntity(
        entity_type=ContextEntityType.PROJECT,
        name="Kontextual Engine",
        external_ref="project:kontextual",
        metadata={"phase": "mvp"},
        entity_id="entity-project-kontextual",
    )
    workflow_run = ContextEntity(
        entity_type=ContextEntityType.WORKFLOW_RUN,
        name="Initial ingestion",
        external_ref="workflow-run-42",
        entity_id="entity-workflow-run-42",
    )

    registry.link_asset_to_context_entity(
        policy.asset.id,
        project,
        "about_project",
        ctx,
        confidence=0.96,
        provenance={"producer": "test-fixture"},
    )
    registry.link_asset_to_context_entity(
        implementation.asset.id,
        workflow_run,
        "produced_by_run",
        ctx,
    )
    registry.link_asset_to_asset(
        implementation.asset.id,
        policy.asset.id,
        "implements",
        ctx,
        confidence=0.88,
        provenance={"basis": "workplan"},
    )

    by_project = AssetQueryRequest(
        context_entity_type=ContextEntityType.PROJECT,
        context_entity_name="Kontextual Engine",
        relationship_predicate="about_project",
    )
    project_result = retrieval.query_assets(by_project, ctx)
    by_related_asset = AssetQueryRequest(
        related_asset_id=policy.asset.id,
        relationship_predicate="implements",
    )
    related_result = retrieval.query_assets(by_related_asset, ctx)
    by_workflow_run = AssetQueryRequest(workflow_run_id="workflow-run-42")
    workflow_result = retrieval.query_assets(by_workflow_run, ctx)

    assert _asset_ids(project_result) == ["asset-policy"]
    assert project_result.items[0].relationships[0].predicate == "about_project"
    assert project_result.items[0].relationships[0].confidence == 0.96
    assert project_result.items[0].context_entities[0].entity_id == "entity-project-kontextual"
    assert (
        project_result.to_dict()["results"][0]["relationships"][0]["provenance"]["producer"]
        == "test-fixture"
    )

    assert _asset_ids(related_result) == ["asset-implementation"]
    assert related_result.items[0].relationships[0].target_id == "asset-policy"

    assert _asset_ids(workflow_result) == ["asset-implementation"]
    assert workflow_result.items[0].context_entities[0].entity_type == ContextEntityType.WORKFLOW_RUN


def test_context_entity_and_relationship_queries_are_backend_portable_with_sqlite(tmp_path: Path) -> None:
    """Query context entities and relationships on SQLite after linking one asset to a case."""
    store = SQLiteAssetRegistryRepository(tmp_path / "registry.sqlite")
    registry = AssetRegistryService(store)
    retrieval = AssetRetrievalService(store)
    ctx = operation_context()

    policy = registry.create_asset(
        "Policy",
        Classification(asset_type="policy", sensitivity=Sensitivity.INTERNAL),
        ctx,
        asset_id="asset-policy",
    )
    migration_case = ContextEntity(
        entity_type=ContextEntityType.CASE,
        name="Migration Case",
        external_ref="case:migration",
        metadata={"priority": "high"},
        entity_id="entity-case-migration",
    )
    linked = registry.link_asset_to_context_entity(
        policy.asset.id,
        migration_case,
        "about_case",
        ctx,
        confidence=0.73,
        provenance={"source": "test"},
    )

    entity_request = ContextEntityQueryRequest(
        entity_type="case",
        external_ref="case:migration",
        metadata_filters={"priority": "high"},
    )
    entities = retrieval.query_context_entities(entity_request, ctx)
    relationship_request = RelationshipQueryRequest(
        context_entity_id="entity-case-migration",
        predicate="about_case",
        target_kind="context_entity",
    )
    relationships = retrieval.query_relationships(relationship_request, ctx)

    assert entities.total == 1
    assert entities.items[0].asset_ids == ("asset-policy",)
    assert entities.items[0].relationship_count == 1

    assert relationships.total == 1
    assert (
        relationships.items[0].relationship.relationship_id
        == linked.relationship.relationship_id
    )
    assert relationships.items[0].source_asset.id == "asset-policy"
    assert relationships.items[0].target_entity.entity_id == "entity-case-migration"
    assert relationships.to_dict()["results"][0]["confidence"] == 0.73


def test_graph_retrieval_invalid_queries_return_structured_diagnostics() -> None:
    """Invalid asset, entity and relationship queries each report their own diagnostic codes."""
    retrieval = AssetRetrievalService(InMemoryAssetRegistryRepository())
    ctx = operation_context()

    asset_result = retrieval.query_assets(
        AssetQueryRequest(context_entity_type="planet", relationship_direction="sideways"),
        ctx,
    )
    entity_result = retrieval.query_context_entities(
        ContextEntityQueryRequest(entity_type="planet", sort_by="rank", limit=0),
        ctx,
    )
    relationship_result = retrieval.query_relationships(
        RelationshipQueryRequest(
            target_kind="memory_phase",
            direction="sideways",
            sort_order="diagonal",
            offset=-1,
        ),
        ctx,
    )

    assert asset_result.success is False
    assert set(_diagnostic_codes(asset_result)) == {
        "retrieval.context_entity_type_invalid",
        "retrieval.relationship_direction_invalid",
    }

    assert entity_result.success is False
    assert set(_diagnostic_codes(entity_result)) == {
        "retrieval.context_entity_type_invalid",
        "retrieval.sort_invalid",
        "retrieval.limit_invalid",
    }

    assert relationship_result.success is False
    assert set(_diagnostic_codes(relationship_result)) == {
        "retrieval.relationship_target_kind_invalid",
        "retrieval.relationship_direction_invalid",
        "retrieval.sort_order_invalid",
        "retrieval.offset_invalid",
    }


def test_retrieval_policy_filters_assets_relationships_and_context_payloads() -> None:
    """The deny-confidential policy hides the confidential asset from every query surface.

    The confidential asset is dropped from the asset listing, the relationship
    that targets it is dropped, and the shared case entity reports only the
    public asset. The audit event for the asset query is marked ``partial``.
    """
    store = InMemoryAssetRegistryRepository()
    registry = AssetRegistryService(store)
    retrieval = AssetRetrievalService(store, policy_gateway=DenyConfidentialRetrievalPolicy())
    ctx = operation_context()

    public = registry.create_asset(
        "Public",
        Classification(asset_type="document", sensitivity=Sensitivity.PUBLIC),
        ctx,
        asset_id="asset-public",
    )
    confidential = registry.create_asset(
        "Confidential",
        Classification(asset_type="document", sensitivity=Sensitivity.CONFIDENTIAL),
        ctx,
        asset_id="asset-confidential",
    )
    visible_case = ContextEntity(
        entity_type=ContextEntityType.CASE,
        name="Visible Case",
        entity_id="entity-visible-case",
    )
    registry.link_asset_to_context_entity(public.asset.id, visible_case, "about_case", ctx)
    registry.link_asset_to_context_entity(confidential.asset.id, visible_case, "about_case", ctx)
    registry.link_asset_to_asset(public.asset.id, confidential.asset.id, "references", ctx)

    assets = retrieval.query_assets(AssetQueryRequest(sort_by="asset_id"), ctx)
    relationships = retrieval.query_relationships(
        RelationshipQueryRequest(predicate="references"),
        ctx,
    )
    entities = retrieval.query_context_entities(
        ContextEntityQueryRequest(entity_id="entity-visible-case"),
        ctx,
    )

    assert _asset_ids(assets) == ["asset-public"]
    assert assets.total == 1
    assert assets.metadata["policy_enforced"] is True
    assert relationships.total == 0
    assert entities.items[0].asset_ids == ("asset-public",)
    assert entities.items[0].relationship_count == 1

    retrieval_audit = _latest_asset_query_audit(store)
    assert retrieval_audit.outcome.value == "partial"
    assert retrieval_audit.details["permission_filtered_count"] == 1
    assert retrieval_audit.policy_decision.action == "retrieval.assets.query"


def test_retrieval_scope_policy_fail_closed_returns_empty_denied_envelope() -> None:
    """A policy gateway that raises on the query action yields a denied, empty envelope."""
    store = InMemoryAssetRegistryRepository()
    registry = AssetRegistryService(store)
    retrieval = AssetRetrievalService(store, policy_gateway=BrokenRetrievalPolicy())
    ctx = operation_context()
    registry.create_asset(
        "Public",
        Classification(asset_type="document", sensitivity=Sensitivity.PUBLIC),
        ctx,
        asset_id="asset-public",
    )

    result = retrieval.query_assets(AssetQueryRequest(), ctx)

    assert result.success is False
    assert result.total == 0
    assert result.items == ()
    assert _diagnostic_codes(result) == ["retrieval.permission_denied"]
    decision = result.diagnostics[0].details["policy_decision"]
    assert decision["effect"] == "fail_closed"

    # Unfiltered on purpose: the denied query must be the most recent audit event.
    last_event = store.list_audit_events(correlation_id="corr-retrieval")[-1]
    assert last_event.operation == "retrieval.assets.query"
    assert last_event.outcome.value == "denied"
    assert last_event.policy_decision.effect.value == "fail_closed"


def test_retrieval_feedback_persists_and_quality_metrics_use_feedback_and_audit(tmp_path: Path) -> None:
    """Feedback written through one SQLite-backed service is readable from a fresh one.

    A second service over the same database file lists the stored feedback and
    computes quality metrics from it and from the supplied query result.
    """
    db_path = tmp_path / "registry.sqlite"
    store = SQLiteAssetRegistryRepository(db_path)
    registry = AssetRegistryService(store)
    retrieval = AssetRetrievalService(store)
    ctx = operation_context()

    source = SourceReference(source_system="repo", path="docs/feedback.md", checksum="sha256:feedback")
    registry.create_asset(
        "Feedback Source",
        Classification(asset_type="document", sensitivity=Sensitivity.PUBLIC),
        ctx,
        asset_id="asset-feedback",
        source_refs=[source],
        representations=[
            AssetRepresentation.from_content(
                "asset-feedback",
                RepresentationKind.NORMALIZED,
                "text/markdown",
                "normalized feedback",
                source_ref_id=source.id,
                metadata={"search_text": "alpha feedback citation target", "extractor": "plain-text"},
            )
        ],
    )
    retrieval.refresh_index()

    query_result = retrieval.query_assets(
        AssetQueryRequest(text="alpha", include_snippets=True),
        ctx,
    )

    query_payload = query_result.request.to_dict()
    result_ref = {
        "asset_id": "asset-feedback",
        "rank": 1,
        "representation_id": query_result.items[0].representations[0].representation_id,
        "source_ref_id": source.id,
    }
    feedback = retrieval.record_feedback(
        RetrievalFeedbackRequest(
            label=RetrievalFeedbackLabel.USEFUL,
            query=query_payload,
            result_ref=result_ref,
            metadata={"citation": True},
        ),
        ctx,
    )

    # A new repository over the same file: anything read here came from disk.
    reloaded = AssetRetrievalService(SQLiteAssetRegistryRepository(db_path))
    records = reloaded.list_feedback(correlation_id="corr-retrieval")
    metrics = reloaded.quality_metrics(query_results=(query_result,), precision_at_k=1)

    assert feedback.success is True
    assert records[0].feedback_id == feedback.record.feedback_id
    assert records[0].query["text"] == "alpha"
    assert records[0].result_ref["asset_id"] == "asset-feedback"

    assert metrics.feedback_count == 1
    assert metrics.useful_count == 1
    assert metrics.zero_result_rate == 0.0
    assert metrics.precision_at_k == 1.0
    assert metrics.citation_precision == 1.0
    assert metrics.permission_filter_observation_count >= 1
    assert metrics.average_permission_filter_duration_ms is not None


def test_retrieval_feedback_invalid_label_returns_structured_diagnostic() -> None:
    """An unknown feedback label is rejected with one diagnostic and no stored record."""
    retrieval = AssetRetrievalService(InMemoryAssetRegistryRepository())
    bad_feedback = RetrievalFeedbackRequest(label="maybe", query={}, result_ref={})

    result = retrieval.record_feedback(bad_feedback, operation_context())

    assert result.success is False
    assert result.record is None
    assert _diagnostic_codes(result) == ["retrieval.feedback_label_invalid"]


def _asset_ids(envelope) -> list:
    """Return the asset id of every item in a query envelope, in result order."""
    return [item.asset.id for item in envelope.items]


def _diagnostic_codes(envelope) -> list:
    """Return the code of every diagnostic attached to an envelope, in order."""
    return [diagnostic.code for diagnostic in envelope.diagnostics]


def _latest_asset_query_audit(repo):
    """Return the most recent ``retrieval.assets.query`` audit event for the test correlation id."""
    query_events = [
        event
        for event in repo.list_audit_events(correlation_id="corr-retrieval")
        if event.operation == "retrieval.assets.query"
    ]
    return query_events[-1]


def operation_context() -> OperationContext:
    """Build the operation context shared by all tests in this module.

    The actor is a human user in the ``engineering`` group and the correlation
    id is fixed at ``corr-retrieval`` so audit and feedback lookups can key on it.
    """
    tester = Actor.create(
        ActorType.HUMAN,
        actor_id="user-retrieval",
        display_name="Retrieval Tester",
        groups=["engineering"],
    )
    return OperationContext.create(tester, correlation_id="corr-retrieval")


class DenyConfidentialRetrievalPolicy:
    """Test policy gateway that refuses ``asset.retrieve`` on confidential assets."""

    def authorize(
        self,
        context: OperationContext,
        action: str,
        resource: str,
        *,
        resource_metadata: dict | None = None,
    ) -> PolicyDecision:
        """Return a fail-closed decision for confidential retrieval, otherwise allow.

        ``resource_metadata`` defaults to an empty dict when omitted or falsy
        and is echoed back in the decision context either way.
        """
        resource_metadata = resource_metadata or {}
        # Everything except retrieval of a confidential asset is allowed.
        if (
            action != "asset.retrieve"
            or resource_metadata.get("sensitivity") != Sensitivity.CONFIDENTIAL.value
        ):
            return PolicyDecision.allow(
                context.actor.id,
                action,
                resource,
                context={"resource_metadata": resource_metadata},
            )
        return PolicyDecision.fail_closed(
            context.actor.id,
            action,
            resource,
            reason="confidential retrieval denied in test policy",
            context={"resource_metadata": resource_metadata},
        )


class BrokenRetrievalPolicy:
    """Test policy gateway that raises when asked to authorize an asset query."""

    def authorize(
        self,
        context: OperationContext,
        action: str,
        resource: str,
        *,
        resource_metadata: dict | None = None,
    ) -> PolicyDecision:
        """Raise ``RuntimeError`` for ``retrieval.assets.query``; allow every other action."""
        if action == "retrieval.assets.query":
            raise RuntimeError("policy context unavailable")
        actor_id = context.actor.id
        return PolicyDecision.allow(actor_id, action, resource)