content-addressed blob storage: blob_storage.py, memory, local, and S3 adapters

This commit is contained in:
2026-05-07 03:51:25 +02:00
parent c2bc7071d7
commit ebace73761
22 changed files with 1489 additions and 47 deletions

View File

@@ -5,6 +5,7 @@ from .asset_service import (
AssetRegistryService,
RelationshipChangeResult,
)
from .content_service import RepresentationContentResult, RepresentationContentStream, RepresentationContentService
from .ingestion_service import AssetIngestionResult, AssetIngestionService
from .retrieval_service import (
AssetQueryItem,
@@ -53,6 +54,9 @@ __all__ = [
"ContextEntityQueryResult",
"LexicalIndexRefreshResult",
"RelationshipChangeResult",
"RepresentationContentResult",
"RepresentationContentStream",
"RepresentationContentService",
"RelationshipQueryItem",
"RelationshipQueryRequest",
"RelationshipQueryResult",

View File

@@ -0,0 +1,383 @@
"""Governed representation byte storage and streaming service."""
from __future__ import annotations
import hashlib
from collections.abc import Iterable, Iterator
from dataclasses import dataclass
from typing import Any
from kontextual_engine.core import (
AssetRepresentation,
AuditEvent,
AuditOutcome,
OperationContext,
PolicyDecision,
RepresentationKind,
new_id,
)
from kontextual_engine.errors import AuthorizationError, NotFoundError, ValidationError
from kontextual_engine.ports import (
AllowAllPolicyGateway,
AssetRegistryRepository,
BlobCleanupResult,
BlobRef,
BlobStorage,
PolicyGateway,
)
from kontextual_engine.services.asset_service import AssetChangeResult, AssetRegistryService
@dataclass(frozen=True)
class RepresentationContentResult:
    """Outcome of a fully buffered, authorized representation read.

    Carries the representation record, the complete content bytes, the blob
    reference returned by blob storage, and the policy decision and audit
    event produced while serving the read.
    """

    representation: AssetRepresentation
    content: bytes
    blob: BlobRef
    policy_decision: PolicyDecision
    audit_event: AuditEvent

    def to_dict(self, *, include_content: bool = False) -> dict[str, Any]:
        """Return a dictionary view of this result.

        Args:
            include_content: When true, the raw ``bytes`` are added under the
                ``"content"`` key; otherwise that key is absent.

        Returns:
            A dict with ``"representation"``, ``"content_stream"``,
            ``"policy_decision"`` and ``"audit_event"`` entries, plus
            ``"content"`` when requested.
        """
        rep = self.representation
        payload: dict[str, Any] = {"representation": rep.to_dict()}
        # Flat summary of the representation fields next to the blob reference.
        payload["content_stream"] = {
            "representation_id": rep.representation_id,
            "asset_id": rep.asset_id,
            "media_type": rep.media_type,
            "digest": rep.digest,
            "size_bytes": rep.size_bytes,
            "storage_ref": rep.storage_ref,
            "blob": self.blob.to_dict(),
        }
        payload["policy_decision"] = self.policy_decision.to_dict()
        payload["audit_event"] = self.audit_event.to_dict()
        if include_content:
            # The bytes are stored as-is; no encoding is applied here.
            payload["content"] = self.content
        return payload
@dataclass(frozen=True)
class RepresentationContentStream:
    """Outcome of an authorized, chunked representation read.

    Same shape as a buffered read result, except the content is exposed as
    an iterable of byte chunks instead of a single ``bytes`` value.
    """

    # Representation record the chunks belong to.
    representation: AssetRepresentation
    # Content as an iterable of byte chunks.
    chunks: Iterable[bytes]
    # Blob reference for the stored content.
    blob: BlobRef
    # Policy decision attached to this read.
    policy_decision: PolicyDecision
    # Audit event attached to this read.
    audit_event: AuditEvent
class RepresentationContentService:
    """Write representation bytes to blob storage and read them back under policy.

    Reads are authorized through the policy gateway, checked against the
    digest recorded on the representation, and recorded as audit events.
    """

    def __init__(
        self,
        repository: AssetRegistryRepository,
        blob_storage: BlobStorage,
        *,
        policy_gateway: PolicyGateway | None = None,
        asset_service: AssetRegistryService | None = None,
    ) -> None:
        """Wire the service to its repository, blob storage, and collaborators.

        When no policy gateway is supplied an ``AllowAllPolicyGateway`` is
        used; when no asset service is supplied an ``AssetRegistryService``
        is built on the same repository and gateway.
        """
        if not policy_gateway:
            policy_gateway = AllowAllPolicyGateway()
        self.repository = repository
        self.blob_storage = blob_storage
        self.policy_gateway = policy_gateway
        if not asset_service:
            asset_service = AssetRegistryService(repository, policy_gateway=policy_gateway)
        self.asset_service = asset_service

    def build_representation_from_bytes(
        self,
        asset_id: str,
        kind: RepresentationKind | str,
        media_type: str,
        content: str | bytes,
        *,
        producer: str | None = None,
        source_ref_id: str | None = None,
        metadata: dict[str, Any] | None = None,
        representation_id: str | None = None,
    ) -> tuple[AssetRepresentation, BlobRef, bool]:
        """Store *content* in blob storage and describe it as a representation.

        Text content is encoded as UTF-8 before it is stored. The
        representation is only constructed here; this method makes no
        repository or policy-gateway call.

        Returns:
            ``(representation, blob, created)`` where ``blob`` and ``created``
            come from the blob storage write result.
        """
        if isinstance(content, str):
            raw = content.encode("utf-8")
        else:
            raw = bytes(content)
        stored = self.blob_storage.put_bytes(raw, media_type=media_type)
        blob = stored.blob
        parsed_kind = RepresentationKind(kind)
        # Caller metadata is applied last, so it wins on a "blob_adapter" clash.
        merged_metadata = {"blob_adapter": blob.adapter}
        merged_metadata.update(metadata or {})
        representation = AssetRepresentation(
            asset_id=asset_id,
            kind=parsed_kind,
            media_type=media_type,
            digest=blob.digest,
            size_bytes=blob.size_bytes,
            storage_ref=blob.storage_ref,
            producer=producer,
            source_ref_id=source_ref_id,
            metadata=merged_metadata,
            representation_id=representation_id or new_id("repr"),
        )
        return representation, blob, stored.created

    def add_representation_from_bytes(
        self,
        asset_id: str,
        kind: RepresentationKind | str,
        media_type: str,
        content: str | bytes,
        context: OperationContext,
        *,
        expected_current_version_id: str | None = None,
        producer: str | None = None,
        source_ref_id: str | None = None,
        metadata: dict[str, Any] | None = None,
    ) -> AssetChangeResult:
        """Store *content* and attach the resulting representation to an asset.

        The bytes are written through ``build_representation_from_bytes`` and
        the representation is then handed to the asset service's
        ``add_representation``, whose result is returned unchanged.
        """
        # NOTE(review): the blob is written before the asset service runs; if
        # that call raises, the blob presumably stays in storage until an
        # unreferenced-blob cleanup - confirm this is intended.
        built = self.build_representation_from_bytes(
            asset_id,
            kind,
            media_type,
            content,
            producer=producer,
            source_ref_id=source_ref_id,
            metadata=metadata,
        )
        representation = built[0]
        return self.asset_service.add_representation(
            asset_id,
            representation,
            context,
            expected_current_version_id=expected_current_version_id,
        )

    def get_content_stream(
        self,
        asset_id: str,
        context: OperationContext,
        *,
        representation_id: str | None = None,
        kind: RepresentationKind | str | None = None,
    ) -> RepresentationContentResult:
        """Read a representation's full content into memory after authorization.

        Despite the name, the complete bytes are loaded and returned in the
        result. Both the digest reported by blob storage and a SHA-256 of the
        bytes actually read must equal the representation digest.

        Raises:
            NotFoundError: No matching representation, no ``storage_ref`` on
                it, or blob storage raised ``ValueError`` for the reference.
            AuthorizationError: The policy gateway denied the read.
            ValidationError: A digest check failed.
        """
        asset = self.repository.get_asset(asset_id)
        selected = self._representation(asset_id, representation_id=representation_id, kind=kind)
        storage_ref = selected.storage_ref
        if not storage_ref:
            raise NotFoundError(
                "Representation content is not available in blob storage",
                details={"asset_id": asset_id, "representation_id": selected.representation_id},
            )
        target = f"asset:{asset.id}"
        decision = self._authorize_content_read(context, target, selected)
        try:
            blob = self.blob_storage.stat(storage_ref)
            content = self.blob_storage.read_bytes(storage_ref)
        except ValueError as exc:
            raise self._storage_unavailable(asset_id, selected) from exc
        self._ensure_blob_digest(selected, blob)
        self._ensure_content_digest(selected, "sha256:" + hashlib.sha256(content).hexdigest())
        event = self._audit_content_read(context, target, selected, decision)
        return RepresentationContentResult(
            representation=selected,
            content=content,
            blob=blob,
            policy_decision=decision,
            audit_event=event,
        )

    def stream_content(
        self,
        asset_id: str,
        context: OperationContext,
        *,
        representation_id: str | None = None,
        kind: RepresentationKind | str | None = None,
        chunk_size: int = 65536,
    ) -> RepresentationContentStream:
        """Open a chunked, digest-verified read of a representation's content.

        Only the blob's reported digest is checked up front. The bytes are
        hashed while the returned ``chunks`` iterable is consumed, so a
        content mismatch surfaces as ``ValidationError`` after the last chunk.

        Raises:
            NotFoundError: No matching representation, no ``storage_ref`` on
                it, or blob storage raised ``ValueError`` for the reference.
            AuthorizationError: The policy gateway denied the read.
            ValidationError: The blob's reported digest differs from the
                representation digest.
        """
        asset = self.repository.get_asset(asset_id)
        selected = self._representation(asset_id, representation_id=representation_id, kind=kind)
        storage_ref = selected.storage_ref
        if not storage_ref:
            raise NotFoundError(
                "Representation content is not available in blob storage",
                details={"asset_id": asset_id, "representation_id": selected.representation_id},
            )
        target = f"asset:{asset.id}"
        decision = self._authorize_content_read(context, target, selected)
        try:
            blob = self.blob_storage.stat(storage_ref)
        except ValueError as exc:
            raise self._storage_unavailable(asset_id, selected) from exc
        self._ensure_blob_digest(selected, blob)
        # NOTE(review): the success audit event is saved before any chunk has
        # been read or verified - confirm that is the intended audit semantics.
        event = self._audit_content_read(context, target, selected, decision)
        chunks = self._verified_chunks(selected, chunk_size=chunk_size)
        return RepresentationContentStream(
            representation=selected,
            chunks=chunks,
            blob=blob,
            policy_decision=decision,
            audit_event=event,
        )

    def referenced_storage_refs(self) -> set[str]:
        """Collect the non-empty ``storage_ref`` of every listed representation."""
        refs: set[str] = set()
        for representation in self.repository.list_representations():
            if representation.storage_ref:
                refs.add(representation.storage_ref)
        return refs

    def cleanup_unreferenced_blobs(self, *, dry_run: bool = True) -> BlobCleanupResult:
        """Ask blob storage to delete blobs that no representation references.

        ``dry_run`` is forwarded to the storage adapter unchanged and defaults
        to ``True``.
        """
        referenced = self.referenced_storage_refs()
        return self.blob_storage.delete_unreferenced(referenced, dry_run=dry_run)

    def _representation(
        self,
        asset_id: str,
        *,
        representation_id: str | None,
        kind: RepresentationKind | str | None,
    ) -> AssetRepresentation:
        """Resolve the representation to read for *asset_id*.

        With a ``representation_id`` the record is fetched directly and must
        belong to the asset; ``kind`` is not consulted in that case. Otherwise
        the asset's representations (optionally filtered by ``kind``) are
        ranked SOURCE, then NORMALIZED, then DERIVED, then anything else, and
        the best-ranked one with the greatest ``(created_at,
        representation_id)`` is returned.

        Raises:
            NotFoundError: The id belongs to another asset, or nothing matched.
        """
        if representation_id:
            found = self.repository.get_representation(representation_id)
            if found.asset_id == asset_id:
                return found
            raise NotFoundError(
                "Representation not found for asset",
                details={"asset_id": asset_id, "representation_id": representation_id},
            )

        parsed_kind = RepresentationKind(kind) if kind else None
        available = self.repository.list_representations(asset_id=asset_id, kind=parsed_kind)
        if not available:
            raise NotFoundError("Representation not found", details={"asset_id": asset_id, "kind": kind})

        rank_by_kind = {
            RepresentationKind.SOURCE: 0,
            RepresentationKind.NORMALIZED: 1,
            RepresentationKind.DERIVED: 2,
        }

        def rank_of(item: AssetRepresentation) -> int:
            # Kinds missing from the table sort after every listed kind.
            return rank_by_kind.get(item.kind, 99)

        top_rank = min(rank_of(item) for item in available)
        return max(
            (item for item in available if rank_of(item) == top_rank),
            key=lambda item: (item.created_at, item.representation_id),
        )

    def _verified_chunks(
        self,
        representation: AssetRepresentation,
        *,
        chunk_size: int,
    ) -> Iterator[bytes]:
        """Yield the blob's chunks while hashing them, then verify the digest.

        This is a generator: nothing runs until iteration starts, and the
        digest comparison happens only after the final chunk has been yielded.

        Raises:
            NotFoundError: The representation has no ``storage_ref``.
            ValidationError: The SHA-256 of the yielded bytes differs from the
                representation digest.
        """
        storage_ref = representation.storage_ref
        if not storage_ref:
            raise NotFoundError(
                "Representation content is not available in blob storage",
                details={"asset_id": representation.asset_id, "representation_id": representation.representation_id},
            )
        running = hashlib.sha256()
        for piece in self.blob_storage.iter_bytes(storage_ref, chunk_size=chunk_size):
            running.update(piece)
            yield piece
        self._ensure_content_digest(representation, "sha256:" + running.hexdigest())

    def _authorize_content_read(
        self,
        context: OperationContext,
        target: str,
        representation: AssetRepresentation,
    ) -> PolicyDecision:
        """Authorize a content read of *representation* against *target*."""
        return self._authorize(
            context,
            "asset.content_stream.read",
            target,
            resource_metadata={
                "representation_id": representation.representation_id,
                "digest": representation.digest,
                "media_type": representation.media_type,
            },
        )

    def _audit_content_read(
        self,
        context: OperationContext,
        target: str,
        representation: AssetRepresentation,
        decision: PolicyDecision,
    ) -> AuditEvent:
        """Save the success audit event for a content read."""
        return self._audit(
            "asset.content_stream.read",
            target,
            AuditOutcome.SUCCESS,
            context,
            decision,
            details={"representation_id": representation.representation_id, "digest": representation.digest},
        )

    def _storage_unavailable(self, asset_id: str, representation: AssetRepresentation) -> NotFoundError:
        """Build the error used when blob storage rejects the storage reference."""
        return NotFoundError(
            "Representation content is not available in configured blob storage",
            details={
                "asset_id": asset_id,
                "representation_id": representation.representation_id,
                "storage_ref": representation.storage_ref,
            },
        )

    def _ensure_blob_digest(self, representation: AssetRepresentation, blob: BlobRef) -> None:
        """Raise ``ValidationError`` unless the blob's digest matches the record."""
        if blob.digest == representation.digest:
            return
        raise ValidationError(
            "Representation digest does not match stored blob",
            details={
                "representation_id": representation.representation_id,
                "representation_digest": representation.digest,
                "blob_digest": blob.digest,
            },
        )

    def _ensure_content_digest(self, representation: AssetRepresentation, actual_digest: str) -> None:
        """Raise ``ValidationError`` unless *actual_digest* matches the record."""
        if actual_digest == representation.digest:
            return
        raise ValidationError(
            "Representation content does not match expected digest",
            details={
                "representation_id": representation.representation_id,
                "representation_digest": representation.digest,
                "actual_digest": actual_digest,
            },
        )

    def _authorize(
        self,
        context: OperationContext,
        action: str,
        resource: str,
        *,
        resource_metadata: dict[str, str] | None = None,
    ) -> PolicyDecision:
        """Ask the policy gateway to allow *action* on *resource*.

        The context's actor is saved to the repository before the gateway is
        consulted. A denial is written as a DENIED audit event and then
        raised.

        Raises:
            AuthorizationError: The decision's ``allowed`` flag is false.
        """
        self.repository.save_actor(context.actor)
        decision = self.policy_gateway.authorize(
            context,
            action,
            resource,
            resource_metadata=resource_metadata,
        )
        if decision.allowed:
            return decision
        denied_event = self._audit(action, resource, AuditOutcome.DENIED, context, decision)
        raise AuthorizationError(
            "Operation denied by policy",
            details={
                "action": action,
                "resource": resource,
                "correlation_id": context.correlation_id,
                "audit_event_id": denied_event.event_id,
                "policy_decision": decision.to_dict(),
            },
        )

    def _audit(
        self,
        operation: str,
        target: str,
        outcome: AuditOutcome,
        context: OperationContext,
        policy_decision: PolicyDecision,
        *,
        details: dict[str, Any] | None = None,
    ) -> AuditEvent:
        """Build an audit event from the context and save it via the repository.

        Returns whatever the repository's ``save_audit_event`` returns.
        """
        event = AuditEvent.from_context(
            operation,
            target,
            outcome,
            context,
            policy_decision=policy_decision,
            details=details,
        )
        return self.repository.save_audit_event(event)