first ingestion/normalization slice

This commit is contained in:
2026-05-06 02:35:40 +02:00
parent 286ebc3cb6
commit 565a5643a3
19 changed files with 1231 additions and 10 deletions

View File

@@ -4,6 +4,16 @@ from .actors import Actor, ActorType, OperationContext
from .assets import AssetRepresentation, KnowledgeAsset, RepresentationKind
from .audit import AuditEvent, AuditOutcome
from .idempotency import IdempotencyRecord, IdempotencyStatus
from .ingestion import (
ConnectorCapability,
ExtractionResult,
ExtractorCapability,
IngestionFailure,
IngestionJob,
IngestionJobStatus,
NormalizedDocument,
SourcePayload,
)
from .metadata import Classification, LifecycleState, MetadataRecord, Sensitivity
from .policy import PolicyDecision, PolicyEffect
from .primitives import content_digest, mapping_digest, new_id, stable_json_dumps, utc_now
@@ -28,15 +38,22 @@ __all__ = [
"AuditEvent",
"AuditOutcome",
"Classification",
"ConnectorCapability",
"ContextEntity",
"ContextEntityType",
"CoreRelationship",
"DerivedArtifactLineage",
"ExtractionResult",
"ExtractorCapability",
"IdempotencyRecord",
"IdempotencyStatus",
"IngestionFailure",
"IngestionJob",
"IngestionJobStatus",
"KnowledgeAsset",
"LifecycleState",
"MetadataRecord",
"NormalizedDocument",
"OperationContext",
"PolicyDecision",
"PolicyEffect",
@@ -44,6 +61,7 @@ __all__ = [
"RepresentationKind",
"Sensitivity",
"SourceReference",
"SourcePayload",
"VersionChangeType",
"content_digest",
"mapping_digest",

View File

@@ -0,0 +1,308 @@
"""Ingestion job and normalized content primitives."""
from __future__ import annotations
from dataclasses import dataclass, field, replace
from enum import Enum
from typing import Any
from .primitives import compact_dict, content_digest, mapping_digest, new_id, stable_json_dumps, utc_now
from .provenance import SourceReference
class IngestionJobStatus(str, Enum):
    """Status values stored on ``IngestionJob.status``.

    The ``str`` mixin makes each member compare equal to its lowercase wire
    value (``IngestionJobStatus.QUEUED == "queued"``), and calling the enum
    with that value (``IngestionJobStatus("queued")``) returns the member.
    """

    # Default status of a freshly constructed IngestionJob.
    QUEUED = "queued"
    # Assigned by IngestionJob.running().
    RUNNING = "running"
    # Assigned by IngestionJob.completed().
    COMPLETED = "completed"
    # Default status applied by IngestionJob.failed().
    FAILED = "failed"
    # Assigned by IngestionJob.partially_completed().
    PARTIALLY_COMPLETED = "partially_completed"
    # No transition helper in the visible part of this module assigns the
    # remaining states directly; IngestionJob.failed() accepts them through
    # its ``status`` keyword.
    RETRIED = "retried"
    QUARANTINED = "quarantined"
    CANCELED = "canceled"
@dataclass(frozen=True)
class IngestionFailure:
    """A single failure or diagnostic entry recorded during ingestion.

    Attributes:
        code: Identifier for the failure.
        message: Description of the failure.
        retriable: Flag carried with the failure; defaults to ``False``.
        details: Free-form extra data; defaults to an empty dict.
    """

    code: str
    message: str
    retriable: bool = False
    details: dict[str, Any] = field(default_factory=dict)

    def to_dict(self) -> dict[str, Any]:
        """Return the failure as a plain mapping.

        ``details`` is shallow-copied and the mapping is passed through
        ``compact_dict`` (from ``.primitives``) before being returned.
        """
        payload: dict[str, Any] = {
            "code": self.code,
            "message": self.message,
            "retriable": self.retriable,
            "details": dict(self.details),
        }
        return compact_dict(payload)

    @classmethod
    def from_dict(cls, data: dict[str, Any]) -> "IngestionFailure":
        """Build a failure from a mapping.

        ``code`` and ``message`` are required (``KeyError`` when absent);
        ``retriable`` falls back to ``False`` and ``details`` to ``{}``.
        """
        failure_code = data["code"]
        failure_message = data["message"]
        can_retry = bool(data.get("retriable", False))
        extra = dict(data.get("details", {}))
        return cls(
            code=failure_code,
            message=failure_message,
            retriable=can_retry,
            details=extra,
        )
@dataclass(frozen=True)
class ConnectorCapability:
    """Declarative description of a connector.

    Attributes:
        connector_name: Name of the connector.
        source_types: Source type identifiers listed for the connector.
        supports_directories: Defaults to ``False``.
        metadata: Free-form extra data; defaults to an empty dict.
    """

    connector_name: str
    source_types: tuple[str, ...]
    supports_directories: bool = False
    metadata: dict[str, Any] = field(default_factory=dict)

    def to_dict(self) -> dict[str, Any]:
        """Return the capability as a plain mapping.

        ``source_types`` is emitted as a list and ``metadata`` as a shallow
        copy; the mapping is passed through ``compact_dict`` before return.
        """
        payload: dict[str, Any] = {
            "connector_name": self.connector_name,
            "source_types": [*self.source_types],
            "supports_directories": self.supports_directories,
            "metadata": dict(self.metadata),
        }
        return compact_dict(payload)
@dataclass(frozen=True)
class ExtractorCapability:
    """Declarative description of an extractor.

    Attributes:
        extractor_name: Name of the extractor.
        media_types: Media type identifiers listed for the extractor.
        extraction_depth: Defaults to ``"text"``; other accepted values are
            not defined in this module.
        produces_structure: Defaults to ``False``.
        optional_dependency: Optional string, ``None`` by default.
        metadata: Free-form extra data; defaults to an empty dict.
    """

    extractor_name: str
    media_types: tuple[str, ...]
    extraction_depth: str = "text"
    produces_structure: bool = False
    optional_dependency: str | None = None
    metadata: dict[str, Any] = field(default_factory=dict)

    def to_dict(self) -> dict[str, Any]:
        """Return the capability as a plain mapping.

        ``media_types`` is emitted as a list and ``metadata`` as a shallow
        copy; the mapping is passed through ``compact_dict`` before return.
        """
        payload: dict[str, Any] = {
            "extractor_name": self.extractor_name,
            "media_types": [*self.media_types],
            "extraction_depth": self.extraction_depth,
            "produces_structure": self.produces_structure,
            "optional_dependency": self.optional_dependency,
            "metadata": dict(self.metadata),
        }
        return compact_dict(payload)
@dataclass(frozen=True)
class SourcePayload:
    """Raw bytes fetched by a connector, together with their provenance.

    Attributes:
        connector_name: Name of the connector that produced the payload.
        source_uri: URI string identifying the source.
        source_ref: ``SourceReference`` associated with the payload.
        media_type: Media type string for ``content``.
        content: The raw payload bytes.
        title: Title string for the payload.
        metadata: Free-form extra data; defaults to an empty dict.
        permission_context: Free-form mapping; defaults to an empty dict.
    """

    connector_name: str
    source_uri: str
    source_ref: SourceReference
    media_type: str
    content: bytes
    title: str
    metadata: dict[str, Any] = field(default_factory=dict)
    permission_context: dict[str, Any] = field(default_factory=dict)

    @property
    def content_digest(self) -> str:
        """Digest of ``content`` as computed by the ``content_digest`` helper."""
        # Name lookup inside a method body skips the class namespace, so this
        # call resolves to the module-level helper imported from .primitives,
        # not to this property.
        return content_digest(self.content)

    @property
    def size_bytes(self) -> int:
        """Length of ``content`` in bytes."""
        return len(self.content)

    def read_text(self, encoding: str = "utf-8", errors: str = "strict") -> str:
        """Decode ``content`` to text.

        Args:
            encoding: Codec name passed to ``bytes.decode``.
            errors: Error handler passed to ``bytes.decode`` (for example
                ``"strict"``, ``"replace"`` or ``"ignore"``). The default
                keeps the strict behaviour callers already rely on.

        Returns:
            The decoded string.

        Raises:
            UnicodeDecodeError: When ``errors`` is ``"strict"`` and the bytes
                are not valid in ``encoding``.
        """
        return self.content.decode(encoding, errors)
@dataclass(frozen=True)
class NormalizedDocument:
    """Extractor output in normalized form.

    Only ``text`` is required. Every container field defaults to an empty
    container, ``title`` and ``confidence`` default to ``None``, and
    ``media_type`` defaults to ``application/vnd.kontextual.normalized+json``.
    """

    text: str
    media_type: str = "application/vnd.kontextual.normalized+json"
    title: str | None = None
    structure: dict[str, Any] = field(default_factory=dict)
    tables: list[dict[str, Any]] = field(default_factory=list)
    links: list[dict[str, Any]] = field(default_factory=list)
    fields: dict[str, Any] = field(default_factory=dict)
    confidence: float | None = None
    unsupported_elements: list[dict[str, Any]] = field(default_factory=list)
    extractor_metadata: dict[str, Any] = field(default_factory=dict)

    @property
    def normalized_hash(self) -> str:
        """Digest of the document, computed by ``mapping_digest``.

        The digest input is ``to_dict(include_hash=False)``, so the hash
        never feeds into itself.
        """
        hashable_view = self.to_dict(include_hash=False)
        return mapping_digest(hashable_view)

    def to_dict(self, *, include_hash: bool = True) -> dict[str, Any]:
        """Return the document as a plain mapping.

        Containers are shallow-copied and the mapping is passed through
        ``compact_dict``. When ``include_hash`` is true, a
        ``"normalized_hash"`` entry is added afterwards.
        """
        payload = compact_dict(
            {
                "title": self.title,
                "text": self.text,
                "media_type": self.media_type,
                "structure": dict(self.structure),
                "tables": [*self.tables],
                "links": [*self.links],
                "fields": dict(self.fields),
                "confidence": self.confidence,
                "unsupported_elements": [*self.unsupported_elements],
                "extractor_metadata": dict(self.extractor_metadata),
            }
        )
        if not include_hash:
            return payload
        payload["normalized_hash"] = self.normalized_hash
        return payload

    def to_json(self) -> str:
        """Serialize ``to_dict()`` (hash included) with ``stable_json_dumps``."""
        as_mapping = self.to_dict()
        return stable_json_dumps(as_mapping)
@dataclass(frozen=True)
class ExtractionResult:
    """Bundle returned from an extraction step.

    Attributes:
        normalized: The ``NormalizedDocument`` that was produced.
        metadata: Free-form extra data; defaults to an empty dict.
        diagnostics: ``IngestionFailure`` entries attached to the result;
            defaults to an empty tuple.
    """

    normalized: NormalizedDocument
    metadata: dict[str, Any] = field(default_factory=dict)
    diagnostics: tuple[IngestionFailure, ...] = ()

    def to_dict(self) -> dict[str, Any]:
        """Return the result as a plain mapping.

        The nested document and each diagnostic are serialized with their
        own ``to_dict``; the mapping is passed through ``compact_dict``.
        """
        document = self.normalized.to_dict()
        extra = dict(self.metadata)
        notes = [entry.to_dict() for entry in self.diagnostics]
        return compact_dict(
            {
                "normalized": document,
                "metadata": extra,
                "diagnostics": notes,
            }
        )
@dataclass(frozen=True)
class IngestionJob:
    """Immutable record of one ingestion job and its outcome.

    Instances are frozen. The transition helpers (``running``, ``completed``,
    ``failed``, ``partially_completed``) never mutate ``self``; each returns a
    copy made with ``dataclasses.replace`` and refreshes ``updated_at``.
    Timestamps are strings produced by ``utc_now().isoformat()``.
    """

    # Caller-supplied description of what to ingest; no schema is enforced here.
    input: dict[str, Any]
    actor_id: str
    correlation_id: str
    status: IngestionJobStatus = IngestionJobStatus.QUEUED
    source_ref: SourceReference | None = None
    output_asset_ids: tuple[str, ...] = ()
    failures: tuple[IngestionFailure, ...] = ()
    partial_results: dict[str, Any] = field(default_factory=dict)
    retry_options: dict[str, Any] = field(default_factory=dict)
    retry_of_job_id: str | None = None
    attempts: int = 1
    metadata: dict[str, Any] = field(default_factory=dict)
    job_id: str = field(default_factory=lambda: new_id("ingest"))
    # The two factories call utc_now() independently, so on a fresh job the
    # values may differ slightly.
    created_at: str = field(default_factory=lambda: utc_now().isoformat())
    updated_at: str = field(default_factory=lambda: utc_now().isoformat())
    completed_at: str | None = None

    @classmethod
    def create(
        cls,
        *,
        input: dict[str, Any],
        actor_id: str,
        correlation_id: str,
        retry_of_job_id: str | None = None,
        metadata: dict[str, Any] | None = None,
    ) -> "IngestionJob":
        """Build a new job with default status, id and timestamps.

        ``input`` and ``metadata`` are shallow-copied so later changes to the
        caller's dicts do not leak into the job.
        """
        return cls(
            input=dict(input),
            actor_id=actor_id,
            correlation_id=correlation_id,
            retry_of_job_id=retry_of_job_id,
            metadata=dict(metadata or {}),
        )

    def _partial_results_or_current(
        self, partial_results: dict[str, Any] | None
    ) -> dict[str, Any]:
        """Return a copy of ``partial_results``, or of the current ones if ``None``."""
        # Test against None, not truthiness: an explicitly passed empty dict
        # means "clear the partial results" and must not be replaced by the
        # previous value.
        chosen = self.partial_results if partial_results is None else partial_results
        return dict(chosen)

    def running(self, *, source_ref: SourceReference | None = None) -> "IngestionJob":
        """Return a copy in RUNNING state.

        ``source_ref`` replaces the stored reference when given; otherwise the
        existing one is kept.
        """
        return replace(
            self,
            status=IngestionJobStatus.RUNNING,
            source_ref=self.source_ref if source_ref is None else source_ref,
            updated_at=utc_now().isoformat(),
        )

    def completed(
        self,
        *,
        output_asset_ids: tuple[str, ...],
        partial_results: dict[str, Any] | None = None,
    ) -> "IngestionJob":
        """Return a copy in COMPLETED state.

        Args:
            output_asset_ids: Ids stored on the copy (coerced to a tuple).
            partial_results: Replacement partial results. ``None`` keeps the
                current ones; an empty dict clears them.
        """
        now = utc_now().isoformat()
        return replace(
            self,
            status=IngestionJobStatus.COMPLETED,
            output_asset_ids=tuple(output_asset_ids),
            partial_results=self._partial_results_or_current(partial_results),
            updated_at=now,
            completed_at=now,
        )

    def failed(
        self,
        failure: IngestionFailure,
        *,
        status: IngestionJobStatus = IngestionJobStatus.FAILED,
        partial_results: dict[str, Any] | None = None,
    ) -> "IngestionJob":
        """Return a copy with ``failure`` appended to ``failures``.

        Args:
            failure: The failure to append.
            status: Status for the copy; defaults to FAILED.
            partial_results: Replacement partial results. ``None`` keeps the
                current ones; an empty dict clears them.
        """
        now = utc_now().isoformat()
        return replace(
            self,
            status=status,
            failures=self.failures + (failure,),
            partial_results=self._partial_results_or_current(partial_results),
            updated_at=now,
            completed_at=now,
        )

    def partially_completed(
        self,
        *,
        output_asset_ids: tuple[str, ...],
        failures: tuple[IngestionFailure, ...],
        partial_results: dict[str, Any],
    ) -> "IngestionJob":
        """Return a copy in PARTIALLY_COMPLETED state.

        Unlike ``failed``, the given ``failures`` replace the stored tuple
        instead of being appended to it.
        """
        now = utc_now().isoformat()
        return replace(
            self,
            status=IngestionJobStatus.PARTIALLY_COMPLETED,
            output_asset_ids=tuple(output_asset_ids),
            failures=tuple(failures),
            partial_results=dict(partial_results),
            updated_at=now,
            completed_at=now,
        )

    def to_dict(self) -> dict[str, Any]:
        """Return the job as a plain mapping.

        The status is emitted as its string value, tuples as lists, and nested
        objects via their own ``to_dict``. The mapping is passed through
        ``compact_dict`` before being returned.
        """
        source_ref = self.source_ref.to_dict() if self.source_ref is not None else None
        return compact_dict(
            {
                "job_id": self.job_id,
                "status": self.status.value,
                "input": dict(self.input),
                "actor_id": self.actor_id,
                "correlation_id": self.correlation_id,
                "source_ref": source_ref,
                "output_asset_ids": list(self.output_asset_ids),
                "failures": [failure.to_dict() for failure in self.failures],
                "partial_results": dict(self.partial_results),
                "retry_options": dict(self.retry_options),
                "retry_of_job_id": self.retry_of_job_id,
                "attempts": self.attempts,
                "metadata": dict(self.metadata),
                "created_at": self.created_at,
                "updated_at": self.updated_at,
                "completed_at": self.completed_at,
            }
        )

    @classmethod
    def from_dict(cls, data: dict[str, Any]) -> "IngestionJob":
        """Rebuild a job from a mapping.

        ``job_id``, ``status``, ``actor_id``, ``correlation_id``,
        ``created_at`` and ``updated_at`` are required (``KeyError`` when
        absent); an unknown status string raises ``ValueError``. Every other
        key falls back to the same default the dataclass field uses.
        """
        source_ref = data.get("source_ref")
        return cls(
            job_id=data["job_id"],
            status=IngestionJobStatus(data["status"]),
            input=dict(data.get("input", {})),
            actor_id=data["actor_id"],
            correlation_id=data["correlation_id"],
            # A missing or empty source_ref entry is treated as "no reference".
            source_ref=SourceReference.from_dict(source_ref) if source_ref else None,
            output_asset_ids=tuple(data.get("output_asset_ids", [])),
            failures=tuple(IngestionFailure.from_dict(item) for item in data.get("failures", [])),
            partial_results=dict(data.get("partial_results", {})),
            retry_options=dict(data.get("retry_options", {})),
            retry_of_job_id=data.get("retry_of_job_id"),
            attempts=int(data.get("attempts", 1)),
            metadata=dict(data.get("metadata", {})),
            created_at=data["created_at"],
            updated_at=data["updated_at"],
            completed_at=data.get("completed_at"),
        )