generated from coulomb/repo-seed
content-addressed blob storage: blob_storage.py, memory, local, and S3 adapters
This commit is contained in:
@@ -5,6 +5,7 @@ from .asset_service import (
|
||||
AssetRegistryService,
|
||||
RelationshipChangeResult,
|
||||
)
|
||||
from .content_service import RepresentationContentResult, RepresentationContentStream, RepresentationContentService
|
||||
from .ingestion_service import AssetIngestionResult, AssetIngestionService
|
||||
from .retrieval_service import (
|
||||
AssetQueryItem,
|
||||
@@ -53,6 +54,9 @@ __all__ = [
|
||||
"ContextEntityQueryResult",
|
||||
"LexicalIndexRefreshResult",
|
||||
"RelationshipChangeResult",
|
||||
"RepresentationContentResult",
|
||||
"RepresentationContentStream",
|
||||
"RepresentationContentService",
|
||||
"RelationshipQueryItem",
|
||||
"RelationshipQueryRequest",
|
||||
"RelationshipQueryResult",
|
||||
|
||||
383
src/kontextual_engine/services/content_service.py
Normal file
383
src/kontextual_engine/services/content_service.py
Normal file
@@ -0,0 +1,383 @@
|
||||
"""Governed representation byte storage and streaming service."""
|
||||
|
||||
from __future__ import annotations
|
||||
|
||||
import hashlib
|
||||
from collections.abc import Iterable, Iterator
|
||||
from dataclasses import dataclass
|
||||
from typing import Any
|
||||
|
||||
from kontextual_engine.core import (
|
||||
AssetRepresentation,
|
||||
AuditEvent,
|
||||
AuditOutcome,
|
||||
OperationContext,
|
||||
PolicyDecision,
|
||||
RepresentationKind,
|
||||
new_id,
|
||||
)
|
||||
from kontextual_engine.errors import AuthorizationError, NotFoundError, ValidationError
|
||||
from kontextual_engine.ports import (
|
||||
AllowAllPolicyGateway,
|
||||
AssetRegistryRepository,
|
||||
BlobCleanupResult,
|
||||
BlobRef,
|
||||
BlobStorage,
|
||||
PolicyGateway,
|
||||
)
|
||||
from kontextual_engine.services.asset_service import AssetChangeResult, AssetRegistryService
|
||||
|
||||
|
||||
@dataclass(frozen=True)
class RepresentationContentResult:
    """Outcome of a fully buffered representation content read.

    Carries the representation record, the bytes read for it, the blob
    descriptor reported by storage, and the policy decision and audit event
    produced for the read.
    """

    representation: AssetRepresentation
    content: bytes
    blob: BlobRef
    policy_decision: PolicyDecision
    audit_event: AuditEvent

    def to_dict(self, *, include_content: bool = False) -> dict[str, Any]:
        """Return a dictionary view of this result.

        Args:
            include_content: When true, the raw ``content`` bytes are added
                under the ``"content"`` key; otherwise that key is absent.

        Returns:
            A dict with the keys ``"representation"``, ``"content_stream"``,
            ``"policy_decision"`` and ``"audit_event"`` (plus ``"content"``
            when requested).
        """
        rep = self.representation
        representation_view = rep.to_dict()
        # Descriptor of the stored bytes: identifiers and integrity data taken
        # from the representation, plus the storage-reported blob descriptor.
        stream_view = {
            "representation_id": rep.representation_id,
            "asset_id": rep.asset_id,
            "media_type": rep.media_type,
            "digest": rep.digest,
            "size_bytes": rep.size_bytes,
            "storage_ref": rep.storage_ref,
            "blob": self.blob.to_dict(),
        }
        payload: dict[str, Any] = {
            "representation": representation_view,
            "content_stream": stream_view,
            "policy_decision": self.policy_decision.to_dict(),
            "audit_event": self.audit_event.to_dict(),
        }
        if not include_content:
            return payload
        payload["content"] = self.content
        return payload
|
||||
|
||||
|
||||
@dataclass(frozen=True)
class RepresentationContentStream:
    """Outcome of a chunked representation content read.

    Same shape as a buffered read result, except that the bytes are exposed
    as an iterable of chunks instead of a single ``bytes`` value.
    """

    # Representation record whose content is being served.
    representation: AssetRepresentation
    # Content pieces, in order.  ``RepresentationContentService.stream_content``
    # supplies a generator here that checks the SHA-256 digest only after the
    # final chunk has been yielded.
    chunks: Iterable[bytes]
    # Blob descriptor reported by blob storage for the representation.
    blob: BlobRef
    # Policy decision that allowed the read.
    policy_decision: PolicyDecision
    # Audit event recorded for the read.
    audit_event: AuditEvent
|
||||
|
||||
|
||||
class RepresentationContentService:
    """Governed storage and retrieval of representation bytes.

    Writes go to the configured blob storage and are registered on the asset
    through the asset service.  Reads are authorized through the policy
    gateway, checked against the representation's recorded digest, and
    recorded as audit events in the repository.
    """

    # Action name used for both authorization and auditing of content reads.
    _READ_ACTION = "asset.content_stream.read"

    def __init__(
        self,
        repository: AssetRegistryRepository,
        blob_storage: BlobStorage,
        *,
        policy_gateway: PolicyGateway | None = None,
        asset_service: AssetRegistryService | None = None,
    ) -> None:
        """Wire the service to its collaborators.

        Args:
            repository: Registry used to look up assets/representations and to
                persist actors and audit events.
            blob_storage: Storage adapter that holds the representation bytes.
            policy_gateway: Authorization gateway; ``AllowAllPolicyGateway`` is
                used when none is supplied.
            asset_service: Service used to register new representations; one is
                built on top of ``repository`` and the effective policy gateway
                when none is supplied.
        """
        self.repository = repository
        self.blob_storage = blob_storage
        self.policy_gateway = policy_gateway or AllowAllPolicyGateway()
        self.asset_service = asset_service or AssetRegistryService(
            repository,
            policy_gateway=self.policy_gateway,
        )

    def build_representation_from_bytes(
        self,
        asset_id: str,
        kind: RepresentationKind | str,
        media_type: str,
        content: str | bytes,
        *,
        producer: str | None = None,
        source_ref_id: str | None = None,
        metadata: dict[str, Any] | None = None,
        representation_id: str | None = None,
    ) -> tuple[AssetRepresentation, BlobRef, bool]:
        """Write ``content`` to blob storage and build a representation for it.

        The representation is only constructed here; it is not registered on
        the asset.  ``str`` content is encoded as UTF-8 before being stored.

        Args:
            asset_id: Asset the representation belongs to.
            kind: Representation kind, as an enum member or its value.
            media_type: Media type recorded on the blob and the representation.
            content: Text or bytes to store.
            producer: Optional producer recorded on the representation.
            source_ref_id: Optional source reference recorded on the
                representation.
            metadata: Optional extra metadata.  It is merged over the
                ``"blob_adapter"`` entry, so a caller-supplied
                ``"blob_adapter"`` key takes precedence.
            representation_id: Identifier to use; a fresh ``repr`` id is
                generated when omitted.

        Returns:
            ``(representation, blob, created)`` where ``blob`` and ``created``
            come from the storage write result.
        """
        data = content.encode("utf-8") if isinstance(content, str) else bytes(content)
        write = self.blob_storage.put_bytes(data, media_type=media_type)
        blob = write.blob
        representation = AssetRepresentation(
            asset_id=asset_id,
            kind=RepresentationKind(kind),
            media_type=media_type,
            digest=blob.digest,
            size_bytes=blob.size_bytes,
            storage_ref=blob.storage_ref,
            producer=producer,
            source_ref_id=source_ref_id,
            metadata={"blob_adapter": blob.adapter, **dict(metadata or {})},
            representation_id=representation_id or new_id("repr"),
        )
        return representation, blob, write.created

    def add_representation_from_bytes(
        self,
        asset_id: str,
        kind: RepresentationKind | str,
        media_type: str,
        content: str | bytes,
        context: OperationContext,
        *,
        expected_current_version_id: str | None = None,
        producer: str | None = None,
        source_ref_id: str | None = None,
        metadata: dict[str, Any] | None = None,
    ) -> AssetChangeResult:
        """Store ``content`` and register it as a representation of the asset.

        The blob is written before the asset service is called, so a failure
        in the asset service leaves the blob in storage without a registered
        representation.

        Returns:
            The result of ``AssetRegistryService.add_representation``.
        """
        representation, _blob, _created = self.build_representation_from_bytes(
            asset_id,
            kind,
            media_type,
            content,
            producer=producer,
            source_ref_id=source_ref_id,
            metadata=metadata,
        )
        return self.asset_service.add_representation(
            asset_id,
            representation,
            context,
            expected_current_version_id=expected_current_version_id,
        )

    def get_content_stream(
        self,
        asset_id: str,
        context: OperationContext,
        *,
        representation_id: str | None = None,
        kind: RepresentationKind | str | None = None,
    ) -> RepresentationContentResult:
        """Read a representation's bytes in full, verified against its digest.

        Args:
            asset_id: Asset whose representation is read.
            context: Operation context used for authorization and auditing.
            representation_id: Exact representation to read; when omitted the
                representation is chosen by ``kind`` and recency.
            kind: Optional kind filter used when no id is given.

        Returns:
            The representation, its bytes, the blob descriptor, the policy
            decision and the success audit event.

        Raises:
            NotFoundError: The representation does not exist, has no storage
                ref, or blob storage raises ``ValueError`` for the ref.
            AuthorizationError: The policy gateway denies the read.
            ValidationError: The blob descriptor's digest or the SHA-256 of
                the bytes read differs from the representation's digest.
        """
        representation, resource, decision = self._authorize_read(
            asset_id,
            context,
            representation_id=representation_id,
            kind=kind,
        )
        try:
            blob = self.blob_storage.stat(representation.storage_ref)
            content = self.blob_storage.read_bytes(representation.storage_ref)
        except ValueError as exc:
            raise self._blob_unavailable(asset_id, representation) from exc
        self._ensure_blob_digest(representation, blob)
        # Do not trust the descriptor alone: hash the bytes actually returned.
        actual_digest = "sha256:" + hashlib.sha256(content).hexdigest()
        if actual_digest != representation.digest:
            raise self._content_digest_mismatch(representation, actual_digest)
        event = self._audit_read(resource, context, decision, representation)
        return RepresentationContentResult(representation, content, blob, decision, event)

    def stream_content(
        self,
        asset_id: str,
        context: OperationContext,
        *,
        representation_id: str | None = None,
        kind: RepresentationKind | str | None = None,
        chunk_size: int = 65536,
    ) -> RepresentationContentStream:
        """Open a chunked, digest-verified read of a representation's bytes.

        Lookup, authorization, the blob ``stat`` and the descriptor digest
        check run eagerly, and the success audit event is recorded before any
        chunk is read.  The content digest is verified lazily: consuming
        ``chunks`` raises ``ValidationError`` only after the last chunk has
        already been yielded, so consumers must treat data as unverified
        until the iterator is exhausted without error.

        Args:
            asset_id: Asset whose representation is read.
            context: Operation context used for authorization and auditing.
            representation_id: Exact representation to read; when omitted the
                representation is chosen by ``kind`` and recency.
            kind: Optional kind filter used when no id is given.
            chunk_size: Chunk size passed through to the blob storage iterator.

        Raises:
            NotFoundError: The representation does not exist, has no storage
                ref, or blob storage raises ``ValueError`` for the ref.
            AuthorizationError: The policy gateway denies the read.
            ValidationError: The blob descriptor's digest differs from the
                representation's digest.
        """
        representation, resource, decision = self._authorize_read(
            asset_id,
            context,
            representation_id=representation_id,
            kind=kind,
        )
        try:
            blob = self.blob_storage.stat(representation.storage_ref)
        except ValueError as exc:
            raise self._blob_unavailable(asset_id, representation) from exc
        self._ensure_blob_digest(representation, blob)
        event = self._audit_read(resource, context, decision, representation)
        return RepresentationContentStream(
            representation,
            self._verified_chunks(representation, chunk_size=chunk_size),
            blob,
            decision,
            event,
        )

    def referenced_storage_refs(self) -> set[str]:
        """Return the non-empty storage refs of all representations in the repository."""
        return {
            representation.storage_ref
            for representation in self.repository.list_representations()
            if representation.storage_ref
        }

    def cleanup_unreferenced_blobs(self, *, dry_run: bool = True) -> BlobCleanupResult:
        """Ask blob storage to delete blobs that no representation references.

        Args:
            dry_run: Forwarded to the storage adapter; defaults to ``True``.
        """
        return self.blob_storage.delete_unreferenced(self.referenced_storage_refs(), dry_run=dry_run)

    def _authorize_read(
        self,
        asset_id: str,
        context: OperationContext,
        *,
        representation_id: str | None,
        kind: RepresentationKind | str | None,
    ) -> tuple[AssetRepresentation, str, PolicyDecision]:
        """Resolve the representation to read and authorize the read.

        Returns:
            ``(representation, resource, decision)`` where ``resource`` is the
            ``asset:<id>`` string used for authorization and auditing.

        Raises:
            NotFoundError: No matching representation, or it has no storage ref.
            AuthorizationError: The policy gateway denies the read.
        """
        asset = self.repository.get_asset(asset_id)
        representation = self._representation(asset_id, representation_id=representation_id, kind=kind)
        if not representation.storage_ref:
            raise NotFoundError(
                "Representation content is not available in blob storage",
                details={"asset_id": asset_id, "representation_id": representation.representation_id},
            )
        resource = f"asset:{asset.id}"
        decision = self._authorize(
            context,
            self._READ_ACTION,
            resource,
            resource_metadata={
                "representation_id": representation.representation_id,
                "digest": representation.digest,
                "media_type": representation.media_type,
            },
        )
        return representation, resource, decision

    @staticmethod
    def _blob_unavailable(asset_id: str, representation: AssetRepresentation) -> NotFoundError:
        """Build the error used when blob storage rejects the storage ref."""
        return NotFoundError(
            "Representation content is not available in configured blob storage",
            details={
                "asset_id": asset_id,
                "representation_id": representation.representation_id,
                "storage_ref": representation.storage_ref,
            },
        )

    @staticmethod
    def _ensure_blob_digest(representation: AssetRepresentation, blob: BlobRef) -> None:
        """Raise ``ValidationError`` unless the blob descriptor matches the recorded digest."""
        if blob.digest != representation.digest:
            raise ValidationError(
                "Representation digest does not match stored blob",
                details={
                    "representation_id": representation.representation_id,
                    "representation_digest": representation.digest,
                    "blob_digest": blob.digest,
                },
            )

    @staticmethod
    def _content_digest_mismatch(representation: AssetRepresentation, actual_digest: str) -> ValidationError:
        """Build the error used when the bytes read hash to an unexpected digest."""
        return ValidationError(
            "Representation content does not match expected digest",
            details={
                "representation_id": representation.representation_id,
                "representation_digest": representation.digest,
                "actual_digest": actual_digest,
            },
        )

    def _audit_read(
        self,
        resource: str,
        context: OperationContext,
        decision: PolicyDecision,
        representation: AssetRepresentation,
    ) -> AuditEvent:
        """Record the success audit event for a content read."""
        return self._audit(
            self._READ_ACTION,
            resource,
            AuditOutcome.SUCCESS,
            context,
            decision,
            details={"representation_id": representation.representation_id, "digest": representation.digest},
        )

    def _representation(
        self,
        asset_id: str,
        *,
        representation_id: str | None,
        kind: RepresentationKind | str | None,
    ) -> AssetRepresentation:
        """Select the representation to read for an asset.

        With ``representation_id`` the exact record is fetched and must belong
        to ``asset_id``.  Otherwise the asset's representations (optionally
        filtered by ``kind``) are ranked SOURCE, then NORMALIZED, then DERIVED,
        then any other kind, and the newest one in the best-ranked group wins,
        with ties broken by the greater representation id.

        Raises:
            NotFoundError: The id belongs to another asset, or the asset has
                no matching representation.
        """
        if representation_id:
            representation = self.repository.get_representation(representation_id)
            if representation.asset_id != asset_id:
                raise NotFoundError(
                    "Representation not found for asset",
                    details={"asset_id": asset_id, "representation_id": representation_id},
                )
            return representation
        parsed_kind = RepresentationKind(kind) if kind else None
        representations = self.repository.list_representations(asset_id=asset_id, kind=parsed_kind)
        if not representations:
            raise NotFoundError("Representation not found", details={"asset_id": asset_id, "kind": kind})
        priority = {
            RepresentationKind.SOURCE: 0,
            RepresentationKind.NORMALIZED: 1,
            RepresentationKind.DERIVED: 2,
        }
        best_priority = min(priority.get(item.kind, 99) for item in representations)
        # max() returns the first maximal item, which is the same element a
        # stable descending sort would place first.
        return max(
            (item for item in representations if priority.get(item.kind, 99) == best_priority),
            key=lambda item: (item.created_at, item.representation_id),
        )

    def _verified_chunks(
        self,
        representation: AssetRepresentation,
        *,
        chunk_size: int,
    ) -> Iterator[bytes]:
        """Yield the blob's chunks, then verify their combined SHA-256 digest.

        This is a generator, so nothing (including the storage-ref check) runs
        until iteration starts, and a digest mismatch surfaces only after
        every chunk has been yielded.

        Raises:
            NotFoundError: The representation has no storage ref.
            ValidationError: The streamed bytes do not hash to the
                representation's digest.
        """
        if not representation.storage_ref:
            raise NotFoundError(
                "Representation content is not available in blob storage",
                details={"asset_id": representation.asset_id, "representation_id": representation.representation_id},
            )
        hasher = hashlib.sha256()
        for chunk in self.blob_storage.iter_bytes(representation.storage_ref, chunk_size=chunk_size):
            hasher.update(chunk)
            yield chunk
        actual_digest = "sha256:" + hasher.hexdigest()
        if actual_digest != representation.digest:
            raise self._content_digest_mismatch(representation, actual_digest)

    def _authorize(
        self,
        context: OperationContext,
        action: str,
        resource: str,
        *,
        resource_metadata: dict[str, str] | None = None,
    ) -> PolicyDecision:
        """Authorize ``action`` on ``resource`` for the context's actor.

        The actor is saved to the repository before the policy gateway is
        consulted.  A denial is audited with ``AuditOutcome.DENIED`` and then
        raised.

        Returns:
            The allowing policy decision.

        Raises:
            AuthorizationError: The decision is not allowed; the details carry
                the audit event id and the serialized decision.
        """
        self.repository.save_actor(context.actor)
        decision = self.policy_gateway.authorize(
            context,
            action,
            resource,
            resource_metadata=resource_metadata,
        )
        if not decision.allowed:
            event = self._audit(action, resource, AuditOutcome.DENIED, context, decision)
            raise AuthorizationError(
                "Operation denied by policy",
                details={
                    "action": action,
                    "resource": resource,
                    "correlation_id": context.correlation_id,
                    "audit_event_id": event.event_id,
                    "policy_decision": decision.to_dict(),
                },
            )
        return decision

    def _audit(
        self,
        operation: str,
        target: str,
        outcome: AuditOutcome,
        context: OperationContext,
        policy_decision: PolicyDecision,
        *,
        details: dict[str, Any] | None = None,
    ) -> AuditEvent:
        """Build an audit event from the context and persist it in the repository.

        Returns:
            Whatever ``repository.save_audit_event`` returns for the new event.
        """
        return self.repository.save_audit_event(
            AuditEvent.from_context(
                operation,
                target,
                outcome,
                context,
                policy_decision=policy_decision,
                details=details,
            )
        )
|
||||
Reference in New Issue
Block a user