generated from coulomb/repo-seed
content-addressed blob storage: blob_storage.py, memory, local, and S3 adapters
This commit is contained in:
@@ -83,6 +83,7 @@ def test_cmis_browser_binding_routes_are_advertised_in_openapi(cmis_client) -> N
|
||||
assert "/cmis/{access_point_id}/browser/children" in paths
|
||||
assert "/cmis/{access_point_id}/browser/object/{object_id}" in paths
|
||||
assert "/cmis/{access_point_id}/browser/content/{object_id}" in paths
|
||||
assert "/cmis/{access_point_id}/browser/content-bytes/{object_id}" in paths
|
||||
assert "/cmis/{access_point_id}/browser/acl/{object_id}" in paths
|
||||
assert "/cmis/{access_point_id}/browser/parents/{object_id}" in paths
|
||||
assert "/cmis/{access_point_id}/browser/query" in paths
|
||||
@@ -184,6 +185,9 @@ def test_cmis_governed_authoring_routes_allow_selected_mutations(cmis_client) ->
|
||||
"/cmis/governed-authoring/browser/object/cmis:asset:asset-api-authored/content",
|
||||
json={"content": "# Updated", "media_type": "text/markdown"},
|
||||
)
|
||||
byte_stream = cmis_client.get(
|
||||
"/cmis/governed-authoring/browser/content-bytes/cmis:asset:asset-api-authored",
|
||||
)
|
||||
deleted = cmis_client.post(
|
||||
"/cmis/governed-authoring/browser/object/cmis:asset:asset-api-authored/delete",
|
||||
json={},
|
||||
@@ -192,6 +196,8 @@ def test_cmis_governed_authoring_routes_allow_selected_mutations(cmis_client) ->
|
||||
assert created.status_code == 200
|
||||
assert updated.json()["properties"]["kontextual:metadata:status"] == "draft"
|
||||
assert streamed.json()["content_stream"]["mime_type"] == "text/markdown"
|
||||
assert byte_stream.content == b"# Updated"
|
||||
assert byte_stream.headers["etag"].startswith("sha256:")
|
||||
assert deleted.json()["lifecycle"] == "delete_requested"
|
||||
|
||||
|
||||
|
||||
@@ -164,6 +164,11 @@ def test_runtime_cmis_governed_authoring_allows_selected_mutations(cmis_runtime)
|
||||
{"content": "# Authored\n\nUpdated stream.", "media_type": "text/markdown"},
|
||||
context,
|
||||
)
|
||||
stream_bytes = runtime.cmis_content_stream_bytes(
|
||||
"governed-authoring",
|
||||
"cmis:asset:asset-authored",
|
||||
context,
|
||||
)
|
||||
deleted = runtime.cmis_delete_object(
|
||||
"governed-authoring",
|
||||
"cmis:asset:asset-authored",
|
||||
@@ -174,6 +179,8 @@ def test_runtime_cmis_governed_authoring_allows_selected_mutations(cmis_runtime)
|
||||
assert created["object_id"] == "cmis:asset:asset-authored"
|
||||
assert updated["properties"]["kontextual:metadata:reviewer"] == "codex"
|
||||
assert streamed["content_stream"]["mime_type"] == "text/markdown"
|
||||
assert b"".join(stream_bytes.chunks) == b"# Authored\n\nUpdated stream."
|
||||
assert stream_bytes.representation.storage_ref.startswith("blob://memory/")
|
||||
assert deleted["deleted"] is False
|
||||
assert deleted["lifecycle"] == "delete_requested"
|
||||
|
||||
|
||||
112
tests/test_blob_storage.py
Normal file
112
tests/test_blob_storage.py
Normal file
@@ -0,0 +1,112 @@
|
||||
from __future__ import annotations
|
||||
|
||||
from io import BytesIO
|
||||
|
||||
from kontextual_engine import InMemoryBlobStorage, LocalBlobStorage, S3BlobStorage, content_digest
|
||||
|
||||
|
||||
def test_memory_blob_storage_deduplicates_by_digest() -> None:
    """Writing identical bytes twice to the in-memory store yields one shared blob."""
    payload = b"same content"
    store = InMemoryBlobStorage()

    initial = store.put_bytes(payload, media_type="text/plain")
    repeat = store.put_bytes(payload, media_type="text/plain")

    # Only the first write reports a newly created blob.
    assert initial.created is True
    assert repeat.created is False

    ref = initial.blob.storage_ref
    assert ref == repeat.blob.storage_ref

    # The content is readable both in one piece and in small chunks.
    assert store.read_bytes(ref) == payload
    assert b"".join(store.iter_bytes(ref, chunk_size=4)) == payload

    assert store.exists(initial.blob.digest) is True
    assert len(store.iter_blobs()) == 1
|
||||
|
||||
|
||||
def test_local_blob_storage_stores_one_file_for_duplicate_content(tmp_path) -> None:
    """Writing identical bytes twice to the local store leaves exactly one file on disk."""
    payload = b"local content"
    blob_root = tmp_path / "blobs"
    store = LocalBlobStorage(blob_root)

    initial = store.put_bytes(payload, media_type="text/plain")
    repeat = store.put_bytes(payload, media_type="text/plain")

    # Only the first write reports a newly created blob.
    assert initial.created is True
    assert repeat.created is False

    ref = initial.blob.storage_ref
    assert ref == repeat.blob.storage_ref
    assert store.read_bytes(ref) == payload
    assert b"".join(store.iter_bytes(ref, chunk_size=5)) == payload
    assert len(store.iter_blobs()) == 1

    # Walk the storage root and count regular files only (directories are ignored).
    files_on_disk = [entry for entry in blob_root.rglob("*") if entry.is_file()]
    assert len(files_on_disk) == 1
|
||||
|
||||
|
||||
def test_blob_cleanup_dry_run_and_delete(tmp_path) -> None:
    """A dry run reports the unreferenced blob; the real run removes it and keeps the referenced one."""
    orphan_payload = b"orphan"
    store = LocalBlobStorage(tmp_path / "blobs")
    kept = store.put_bytes(b"kept").blob
    orphan = store.put_bytes(orphan_payload).blob

    # Both passes receive the same set of referenced storage refs; only dry_run differs.
    preview = store.delete_unreferenced({kept.storage_ref}, dry_run=True)
    applied = store.delete_unreferenced({kept.storage_ref}, dry_run=False)

    assert preview.deleted_count == 1
    assert preview.reclaimable_bytes == len(orphan_payload)
    assert orphan.storage_ref in preview.deleted_storage_refs

    assert applied.deleted_storage_refs == (orphan.storage_ref,)
    assert store.exists(kept.storage_ref) is True
    assert store.exists(orphan.storage_ref) is False
|
||||
|
||||
|
||||
def test_s3_blob_storage_uses_content_addressed_keys_with_fake_client() -> None:
    """The S3 adapter stores duplicate content once, under a digest-derived key, via the fake client."""
    payload = b"s3 content"
    fake_client = FakeS3Client()
    store = S3BlobStorage(bucket="test-bucket", prefix="kontextual", client=fake_client)

    initial = store.put_bytes(payload, media_type="text/plain")
    repeat = store.put_bytes(payload, media_type="text/plain")
    fetched = store.read_bytes(initial.blob.storage_ref)
    listing = store.iter_blobs()

    # Only the first write reports a newly created blob.
    assert initial.created is True
    assert repeat.created is False
    assert initial.blob.storage_ref == repeat.blob.storage_ref

    # The blob is addressed by the digest of its content, under the configured prefix.
    assert initial.blob.digest == content_digest(payload)
    assert initial.blob.storage_key.startswith("kontextual/sha256/")

    assert fetched == payload
    assert b"".join(store.iter_bytes(initial.blob.storage_ref, chunk_size=2)) == payload
    assert [entry.storage_ref for entry in listing] == [initial.blob.storage_ref]

    # The second put_bytes must not have triggered another upload.
    assert fake_client.put_count == 1
|
||||
|
||||
|
||||
class FakeS3Client:
    """In-memory stand-in for the S3 client calls exercised by these tests.

    Objects live in ``objects``, keyed by ``(bucket, key)``; each value is the
    keyword-argument mapping that was passed to ``put_object``. ``put_count``
    counts ``put_object`` calls so a test can assert that duplicate content was
    not uploaded again.
    """

    def __init__(self) -> None:
        self.objects: dict[tuple[str, str], dict] = {}
        self.put_count = 0

    def _lookup(self, bucket: str, key: str) -> dict:
        """Return the stored entry for ``(bucket, key)`` or raise ``FakeS3NotFound``."""
        try:
            return self.objects[(bucket, key)]
        except KeyError as exc:
            raise FakeS3NotFound() from exc

    def head_object(self, *, Bucket: str, Key: str) -> dict:
        """Return size, content type and metadata for a stored object.

        Raises:
            FakeS3NotFound: if nothing is stored under ``(Bucket, Key)``.
        """
        item = self._lookup(Bucket, Key)
        return {
            "ContentLength": len(item["Body"]),
            "ContentType": item.get("ContentType"),
            "Metadata": item.get("Metadata", {}),
        }

    def put_object(self, **kwargs) -> None:
        """Store the given keyword arguments under ``(Bucket, Key)``.

        A file-like ``Body`` (anything with ``read``) is read into bytes up
        front, because the other methods call ``len()`` on the stored body and
        wrap it in ``BytesIO``. Any other ``Body`` value is stored unchanged.
        """
        # Count the call before storing, so a failed put is still counted.
        self.put_count += 1
        stored = dict(kwargs)
        body = stored.get("Body")
        if hasattr(body, "read"):
            stored["Body"] = body.read()
        self.objects[(kwargs["Bucket"], kwargs["Key"])] = stored

    def get_object(self, *, Bucket: str, Key: str) -> dict:
        """Return ``{"Body": <readable stream>}`` for a stored object.

        Raises:
            FakeS3NotFound: if nothing is stored under ``(Bucket, Key)``.
        """
        item = self._lookup(Bucket, Key)
        return {"Body": BytesIO(item["Body"])}

    def list_objects_v2(self, *, Bucket: str, Prefix: str, ContinuationToken: str | None = None) -> dict:
        """List key and size of every object in ``Bucket`` whose key starts with ``Prefix``.

        The listing is returned in one sorted page: ``IsTruncated`` is always
        ``False`` and ``ContinuationToken`` is accepted but ignored.
        """
        contents = [
            {"Key": key, "Size": len(item["Body"])}
            for (bucket, key), item in sorted(self.objects.items())
            if bucket == Bucket and key.startswith(Prefix)
        ]
        return {"Contents": contents, "IsTruncated": False}

    def delete_object(self, *, Bucket: str, Key: str) -> None:
        """Remove the object if present; deleting a missing key is a silent no-op."""
        self.objects.pop((Bucket, Key), None)
|
||||
|
||||
|
||||
class FakeS3NotFound(Exception):
    """Error raised by ``FakeS3Client`` when the requested object is not stored.

    Carries a class-level ``response`` mapping with a ``NoSuchKey`` error code
    and a 404 HTTP status.
    NOTE(review): presumably shaped like a botocore ``ClientError.response`` so
    the S3 adapter's not-found detection treats it as a missing key; confirm.
    """

    response = {
        "Error": {"Code": "NoSuchKey"},
        "ResponseMetadata": {"HTTPStatusCode": 404},
    }
|
||||
216
tests/test_representation_content_service.py
Normal file
216
tests/test_representation_content_service.py
Normal file
@@ -0,0 +1,216 @@
|
||||
from __future__ import annotations
|
||||
|
||||
import pytest
|
||||
|
||||
from kontextual_engine import (
|
||||
Actor,
|
||||
ActorType,
|
||||
AssetRegistryService,
|
||||
AuthorizationError,
|
||||
Classification,
|
||||
InMemoryAssetRegistryRepository,
|
||||
InMemoryBlobStorage,
|
||||
LocalBlobStorage,
|
||||
OperationContext,
|
||||
PolicyDecision,
|
||||
RepresentationContentService,
|
||||
RepresentationKind,
|
||||
Sensitivity,
|
||||
ValidationError,
|
||||
)
|
||||
|
||||
|
||||
def test_content_service_adds_representation_bytes_with_deduplicated_blob() -> None:
    """Two representations with identical bytes share one stored blob but get separate versions."""
    payload = b"same bytes"
    repo = InMemoryAssetRegistryRepository()
    blobs = InMemoryBlobStorage()
    context = operation_context()
    classification = Classification(asset_type="document", sensitivity=Sensitivity.INTERNAL)
    AssetRegistryService(repo).create_asset(
        "Content Asset",
        classification,
        context,
        asset_id="asset-content",
    )
    service = RepresentationContentService(repo, blobs)

    source_result = service.add_representation_from_bytes(
        "asset-content", RepresentationKind.SOURCE, "text/plain", payload, context
    )
    derived_result = service.add_representation_from_bytes(
        "asset-content", RepresentationKind.DERIVED, "text/plain", payload, context
    )

    representations = repo.list_representations(asset_id="asset-content")
    assert len(representations) == 2
    rep_a, rep_b = representations
    assert rep_a.storage_ref == rep_b.storage_ref

    # Each added representation advances the asset version by one.
    # NOTE(review): sequence 1 is presumably taken by asset creation; confirm.
    assert source_result.version.sequence == 2
    assert derived_result.version.sequence == 3
    assert len(blobs.iter_blobs()) == 1
|
||||
|
||||
|
||||
def test_content_service_reads_bytes_with_policy_and_audit() -> None:
    """Reading content returns the stored bytes and records a content-stream read audit event."""
    payload = b"read me"
    repo = InMemoryAssetRegistryRepository()
    blobs = InMemoryBlobStorage()
    context = operation_context()
    classification = Classification(asset_type="document", sensitivity=Sensitivity.INTERNAL)
    AssetRegistryService(repo).create_asset(
        "Readable",
        classification,
        context,
        asset_id="asset-readable",
    )
    service = RepresentationContentService(repo, blobs)
    service.add_representation_from_bytes(
        "asset-readable", RepresentationKind.SOURCE, "text/plain", payload, context
    )

    whole = service.get_content_stream("asset-readable", context)
    chunked = service.stream_content("asset-readable", context, chunk_size=3)

    # Buffered and chunked reads must agree on the bytes.
    assert whole.content == payload
    assert b"".join(chunked.chunks) == payload

    assert whole.representation.media_type == "text/plain"
    assert whole.blob.storage_ref == whole.representation.storage_ref

    # The read is audited on the result and is the latest event recorded for the asset.
    assert whole.audit_event.operation == "asset.content_stream.read"
    latest_event = repo.list_audit_events(target="asset:asset-readable")[-1]
    assert latest_event.operation == "asset.content_stream.read"
|
||||
|
||||
|
||||
def test_content_service_reads_source_normalized_and_derived_by_kind() -> None:
    """Each representation kind can be read back individually via the ``kind`` argument."""
    repo = InMemoryAssetRegistryRepository()
    blobs = InMemoryBlobStorage()
    context = operation_context()
    classification = Classification(asset_type="document", sensitivity=Sensitivity.INTERNAL)
    AssetRegistryService(repo).create_asset(
        "Kinds",
        classification,
        context,
        asset_id="asset-kinds",
    )
    service = RepresentationContentService(repo, blobs)

    # Ordered pairs: representations are written, then read back, in this order.
    payload_by_kind = (
        (RepresentationKind.SOURCE, b"source"),
        (RepresentationKind.NORMALIZED, b"normalized"),
        (RepresentationKind.DERIVED, b"derived"),
    )
    for kind, payload in payload_by_kind:
        service.add_representation_from_bytes("asset-kinds", kind, "text/plain", payload, context)

    for kind, payload in payload_by_kind:
        assert service.get_content_stream("asset-kinds", context, kind=kind).content == payload
|
||||
|
||||
|
||||
def test_content_service_denies_bytes_before_exposure() -> None:
    """A policy gateway that denies content reads makes ``get_content_stream`` raise."""
    repo = InMemoryAssetRegistryRepository()
    blobs = InMemoryBlobStorage()
    context = operation_context()
    classification = Classification(asset_type="document", sensitivity=Sensitivity.INTERNAL)
    AssetRegistryService(repo).create_asset(
        "Denied",
        classification,
        context,
        asset_id="asset-denied-stream",
    )

    # Write with a service that has no explicit policy gateway so the content exists.
    unrestricted = RepresentationContentService(repo, blobs)
    unrestricted.add_representation_from_bytes(
        "asset-denied-stream", RepresentationKind.SOURCE, "text/plain", b"secret", context
    )

    # Read through a second service over the same stores, wired to the denying policy.
    restricted = RepresentationContentService(repo, blobs, policy_gateway=DenyContentPolicy())
    with pytest.raises(AuthorizationError):
        restricted.get_content_stream("asset-denied-stream", context)
|
||||
|
||||
|
||||
def test_content_service_cleanup_uses_repository_references() -> None:
    """A dry-run cleanup reports only the blob that no representation in the repository references."""
    orphan_payload = b"orphan"
    repo = InMemoryAssetRegistryRepository()
    blobs = InMemoryBlobStorage()
    context = operation_context()
    classification = Classification(asset_type="document", sensitivity=Sensitivity.INTERNAL)
    AssetRegistryService(repo).create_asset(
        "Cleanup",
        classification,
        context,
        asset_id="asset-cleanup",
    )
    service = RepresentationContentService(repo, blobs)
    service.add_representation_from_bytes(
        "asset-cleanup", RepresentationKind.SOURCE, "text/plain", b"kept", context
    )

    # Written straight to blob storage, bypassing the service, so nothing references it.
    orphan = blobs.put_bytes(orphan_payload).blob

    report = service.cleanup_unreferenced_blobs(dry_run=True)

    assert report.deleted_count == 1
    assert report.deleted_storage_refs == (orphan.storage_ref,)
    assert report.reclaimable_bytes == len(orphan_payload)
|
||||
|
||||
|
||||
def test_content_service_detects_corrupted_stored_content(tmp_path) -> None:
    """Overwriting a stored blob file on disk makes both read paths raise ``ValidationError``."""
    repo = InMemoryAssetRegistryRepository()
    blobs = LocalBlobStorage(tmp_path / "blobs")
    context = operation_context()
    classification = Classification(asset_type="document", sensitivity=Sensitivity.INTERNAL)
    AssetRegistryService(repo).create_asset(
        "Corrupt",
        classification,
        context,
        asset_id="asset-corrupt",
    )
    service = RepresentationContentService(repo, blobs)
    service.add_representation_from_bytes(
        "asset-corrupt", RepresentationKind.SOURCE, "text/plain", b"expected", context
    )

    # Locate the blob file from its storage ref and replace its bytes behind the service's back.
    stored = repo.list_representations(asset_id="asset-corrupt")[0]
    relative_key = stored.storage_ref.removeprefix("blob://local/")
    (blobs.root / relative_key).write_bytes(b"corrupted")

    with pytest.raises(ValidationError):
        service.get_content_stream("asset-corrupt", context)

    # The chunks are joined inside the block, so an error raised while consuming them is caught too.
    with pytest.raises(ValidationError):
        b"".join(service.stream_content("asset-corrupt", context).chunks)
|
||||
|
||||
|
||||
def operation_context() -> OperationContext:
    """Build the operation context used by these tests.

    Returns:
        A context for a human actor with id ``content-test`` and correlation
        id ``corr-content``.
    """
    actor = Actor.create(ActorType.HUMAN, actor_id="content-test")
    return OperationContext.create(actor, correlation_id="corr-content")
|
||||
|
||||
|
||||
class DenyContentPolicy:
    """Policy gateway stub that denies content-stream reads and allows every other action."""

    def authorize(
        self,
        context: OperationContext,
        action: str,
        resource: str,
        *,
        resource_metadata: dict[str, str] | None = None,
    ) -> PolicyDecision:
        """Return a decision for ``action`` on ``resource`` on behalf of the context's actor.

        Only the ``asset.content_stream.read`` action is denied; all other
        actions are allowed. ``resource_metadata`` is accepted but not consulted.
        """
        actor_id = context.actor.id
        if action != "asset.content_stream.read":
            return PolicyDecision.allow(actor_id, action, resource)
        return PolicyDecision.deny(
            actor_id,
            action,
            resource,
            reason="content reads disabled",
        )
|
||||
@@ -658,7 +658,9 @@ def test_service_health_readiness_version_and_openapi_contracts(client) -> None:
|
||||
assert "/cmis/{access_point_id}/browser" in paths
|
||||
assert "/cmis/{access_point_id}/browser/children" in paths
|
||||
assert "/cmis/{access_point_id}/browser/acl/{object_id}" in paths
|
||||
assert "/cmis/{access_point_id}/browser/content-bytes/{object_id}" in paths
|
||||
assert "/cmis/{access_point_id}/browser/parents/{object_id}" in paths
|
||||
assert "/api/v1/assets/{asset_id}/representations/{representation_id}/content" in paths
|
||||
assert "/cmis/{access_point_id}/browser/document" in paths
|
||||
assert "/cmis/{access_point_id}/browser/object/{object_id}/properties" in paths
|
||||
assert "/api/v1/assets" in paths
|
||||
|
||||
Reference in New Issue
Block a user