generated from coulomb/repo-seed
first ingestion/normalization slice
This commit is contained in:
@@ -4,6 +4,16 @@ from .actors import Actor, ActorType, OperationContext
|
||||
from .assets import AssetRepresentation, KnowledgeAsset, RepresentationKind
|
||||
from .audit import AuditEvent, AuditOutcome
|
||||
from .idempotency import IdempotencyRecord, IdempotencyStatus
|
||||
from .ingestion import (
|
||||
ConnectorCapability,
|
||||
ExtractionResult,
|
||||
ExtractorCapability,
|
||||
IngestionFailure,
|
||||
IngestionJob,
|
||||
IngestionJobStatus,
|
||||
NormalizedDocument,
|
||||
SourcePayload,
|
||||
)
|
||||
from .metadata import Classification, LifecycleState, MetadataRecord, Sensitivity
|
||||
from .policy import PolicyDecision, PolicyEffect
|
||||
from .primitives import content_digest, mapping_digest, new_id, stable_json_dumps, utc_now
|
||||
@@ -28,15 +38,22 @@ __all__ = [
|
||||
"AuditEvent",
|
||||
"AuditOutcome",
|
||||
"Classification",
|
||||
"ConnectorCapability",
|
||||
"ContextEntity",
|
||||
"ContextEntityType",
|
||||
"CoreRelationship",
|
||||
"DerivedArtifactLineage",
|
||||
"ExtractionResult",
|
||||
"ExtractorCapability",
|
||||
"IdempotencyRecord",
|
||||
"IdempotencyStatus",
|
||||
"IngestionFailure",
|
||||
"IngestionJob",
|
||||
"IngestionJobStatus",
|
||||
"KnowledgeAsset",
|
||||
"LifecycleState",
|
||||
"MetadataRecord",
|
||||
"NormalizedDocument",
|
||||
"OperationContext",
|
||||
"PolicyDecision",
|
||||
"PolicyEffect",
|
||||
@@ -44,6 +61,7 @@ __all__ = [
|
||||
"RepresentationKind",
|
||||
"Sensitivity",
|
||||
"SourceReference",
|
||||
"SourcePayload",
|
||||
"VersionChangeType",
|
||||
"content_digest",
|
||||
"mapping_digest",
|
||||
|
||||
308
src/kontextual_engine/core/ingestion.py
Normal file
308
src/kontextual_engine/core/ingestion.py
Normal file
@@ -0,0 +1,308 @@
|
||||
"""Ingestion job and normalized content primitives."""
|
||||
|
||||
from __future__ import annotations
|
||||
|
||||
from dataclasses import dataclass, field, replace
|
||||
from enum import Enum
|
||||
from typing import Any
|
||||
|
||||
from .primitives import compact_dict, content_digest, mapping_digest, new_id, stable_json_dumps, utc_now
|
||||
from .provenance import SourceReference
|
||||
|
||||
|
||||
class IngestionJobStatus(str, Enum):
    """Status values an :class:`IngestionJob` can carry.

    The ``str`` mixin makes every member compare equal to its lowercase
    value (``IngestionJobStatus.QUEUED == "queued"``), and a member can be
    looked up from that value with ``IngestionJobStatus("queued")``.
    """

    QUEUED = "queued"
    RUNNING = "running"
    COMPLETED = "completed"
    FAILED = "failed"
    PARTIALLY_COMPLETED = "partially_completed"
    RETRIED = "retried"
    QUARANTINED = "quarantined"
    CANCELED = "canceled"
|
||||
|
||||
|
||||
@dataclass(frozen=True)
class IngestionFailure:
    """Immutable description of one failure or diagnostic raised during ingestion."""

    # Caller-defined identifier for the kind of failure.
    code: str
    # Human-readable explanation.
    message: str
    # Presumably signals that the failed step may be retried; confirm with callers.
    retriable: bool = False
    # Free-form extra context; each instance gets its own dict.
    details: dict[str, Any] = field(default_factory=dict)

    def to_dict(self) -> dict[str, Any]:
        """Return this failure as a plain dict, filtered through ``compact_dict``.

        ``details`` is shallow-copied so the caller cannot mutate the
        instance's own dict through the result's top level.
        """
        return compact_dict(
            {
                "code": self.code,
                "message": self.message,
                "retriable": self.retriable,
                "details": dict(self.details),
            }
        )

    @classmethod
    def from_dict(cls, data: dict[str, Any]) -> "IngestionFailure":
        """Build a failure from a mapping such as the output of :meth:`to_dict`.

        Args:
            data: Mapping with required ``code`` and ``message`` keys and
                optional ``retriable`` and ``details`` keys.

        Returns:
            A new ``IngestionFailure``.

        Raises:
            KeyError: If ``code`` or ``message`` is missing.
        """
        return cls(
            code=data["code"],
            message=data["message"],
            retriable=bool(data.get("retriable", False)),
            # ``or {}`` treats an explicit ``None`` (e.g. a JSON ``null``) like a
            # missing key instead of letting ``dict(None)`` raise ``TypeError``.
            details=dict(data.get("details") or {}),
        )
|
||||
|
||||
|
||||
@dataclass(frozen=True)
class ConnectorCapability:
    """Immutable description of what a source connector offers.

    Only ``connector_name`` and ``source_types`` are required; directory
    support defaults to ``False`` and ``metadata`` to an empty dict.
    """

    connector_name: str
    source_types: tuple[str, ...]
    supports_directories: bool = False
    # Free-form extra information; each instance gets its own dict.
    metadata: dict[str, Any] = field(default_factory=dict)

    def to_dict(self) -> dict[str, Any]:
        """Return a plain-dict form of this capability via ``compact_dict``.

        ``source_types`` is emitted as a list and ``metadata`` as a shallow
        copy of the instance's dict.
        """
        serialized: dict[str, Any] = {
            "connector_name": self.connector_name,
            "source_types": [*self.source_types],
            "supports_directories": self.supports_directories,
            "metadata": dict(self.metadata),
        }
        return compact_dict(serialized)
|
||||
|
||||
|
||||
@dataclass(frozen=True)
class ExtractorCapability:
    """Immutable description of what a content extractor offers.

    ``extractor_name`` and ``media_types`` are required. The defaults
    describe a text-depth extractor that produces no structure and has no
    optional dependency.
    """

    extractor_name: str
    media_types: tuple[str, ...]
    extraction_depth: str = "text"
    produces_structure: bool = False
    # Presumably the name of an optional package the extractor needs; confirm with callers.
    optional_dependency: str | None = None
    # Free-form extra information; each instance gets its own dict.
    metadata: dict[str, Any] = field(default_factory=dict)

    def to_dict(self) -> dict[str, Any]:
        """Return a plain-dict form of this capability via ``compact_dict``.

        ``media_types`` is emitted as a list and ``metadata`` as a shallow
        copy of the instance's dict.
        """
        serialized: dict[str, Any] = {
            "extractor_name": self.extractor_name,
            "media_types": [*self.media_types],
            "extraction_depth": self.extraction_depth,
            "produces_structure": self.produces_structure,
            "optional_dependency": self.optional_dependency,
            "metadata": dict(self.metadata),
        }
        return compact_dict(serialized)
|
||||
|
||||
|
||||
@dataclass(frozen=True)
class SourcePayload:
    """Raw bytes obtained from a source, with the context they arrived with."""

    connector_name: str
    source_uri: str
    source_ref: SourceReference
    media_type: str
    content: bytes
    title: str
    # Both mappings default to a fresh, empty dict per instance.
    metadata: dict[str, Any] = field(default_factory=dict)
    permission_context: dict[str, Any] = field(default_factory=dict)

    @property
    def content_digest(self) -> str:
        """Digest of ``content`` as computed by the imported ``content_digest`` helper."""
        # Inside a method body the bare name resolves to the module-level
        # helper, not to this property: class attributes are not in scope here.
        digest = content_digest(self.content)
        return digest

    @property
    def size_bytes(self) -> int:
        """Length of ``content`` in bytes."""
        raw = self.content
        return len(raw)

    def read_text(self, encoding: str = "utf-8") -> str:
        """Decode ``content`` to text.

        Args:
            encoding: Codec name passed to ``decode``; UTF-8 by default.

        Raises:
            UnicodeDecodeError: If the bytes are not valid in ``encoding``
                (decoding uses the default strict error handling).
        """
        decoded = self.content.decode(encoding)
        return decoded
|
||||
|
||||
|
||||
@dataclass(frozen=True)
class NormalizedDocument:
    """Normalized content produced from a source: text plus optional extras.

    Only ``text`` is required. Every collection field defaults to a fresh
    empty container, and ``title`` / ``confidence`` default to ``None``.
    """

    text: str
    media_type: str = "application/vnd.kontextual.normalized+json"
    title: str | None = None
    structure: dict[str, Any] = field(default_factory=dict)
    tables: list[dict[str, Any]] = field(default_factory=list)
    links: list[dict[str, Any]] = field(default_factory=list)
    fields: dict[str, Any] = field(default_factory=dict)
    confidence: float | None = None
    unsupported_elements: list[dict[str, Any]] = field(default_factory=list)
    extractor_metadata: dict[str, Any] = field(default_factory=dict)

    @property
    def normalized_hash(self) -> str:
        """Digest of the document's dict form, computed without the hash itself."""
        # The hash key must be left out here, otherwise to_dict() and this
        # property would call each other forever.
        hashable = self.to_dict(include_hash=False)
        return mapping_digest(hashable)

    def to_dict(self, *, include_hash: bool = True) -> dict[str, Any]:
        """Return a plain-dict form of the document via ``compact_dict``.

        Args:
            include_hash: When true (the default), add a ``normalized_hash``
                entry holding :attr:`normalized_hash`.

        Returns:
            The compacted dict; containers are shallow copies of the
            instance's own.
        """
        raw: dict[str, Any] = {
            "title": self.title,
            "text": self.text,
            "media_type": self.media_type,
            "structure": dict(self.structure),
            "tables": list(self.tables),
            "links": list(self.links),
            "fields": dict(self.fields),
            "confidence": self.confidence,
            "unsupported_elements": list(self.unsupported_elements),
            "extractor_metadata": dict(self.extractor_metadata),
        }
        result = compact_dict(raw)
        if not include_hash:
            return result
        result["normalized_hash"] = self.normalized_hash
        return result

    def to_json(self) -> str:
        """Return :meth:`to_dict` (hash included) encoded by ``stable_json_dumps``."""
        as_mapping = self.to_dict()
        return stable_json_dumps(as_mapping)
|
||||
|
||||
|
||||
@dataclass(frozen=True)
class ExtractionResult:
    """Outcome of an extraction: the normalized document plus side information."""

    normalized: NormalizedDocument
    # Free-form extra information; each instance gets its own dict.
    metadata: dict[str, Any] = field(default_factory=dict)
    # Failures/diagnostics attached to the result; empty by default.
    diagnostics: tuple[IngestionFailure, ...] = ()

    def to_dict(self) -> dict[str, Any]:
        """Return a plain-dict form of the result via ``compact_dict``.

        The nested document and every diagnostic are serialized with their
        own ``to_dict`` methods; ``metadata`` is shallow-copied.
        """
        diagnostics_out = [entry.to_dict() for entry in self.diagnostics]
        serialized: dict[str, Any] = {
            "normalized": self.normalized.to_dict(),
            "metadata": dict(self.metadata),
            "diagnostics": diagnostics_out,
        }
        return compact_dict(serialized)
|
||||
|
||||
|
||||
@dataclass(frozen=True)
class IngestionJob:
    """Immutable record of one ingestion job and its outcome so far.

    Instances are never mutated. :meth:`running`, :meth:`completed`,
    :meth:`failed` and :meth:`partially_completed` each return a new job
    built with ``dataclasses.replace``.
    """

    # NOTE: ``input`` shadows the builtin, but it is part of the public
    # constructor and serialization interface, so the name must stay.
    input: dict[str, Any]
    actor_id: str
    correlation_id: str
    status: IngestionJobStatus = IngestionJobStatus.QUEUED
    source_ref: SourceReference | None = None
    output_asset_ids: tuple[str, ...] = ()
    failures: tuple[IngestionFailure, ...] = ()
    partial_results: dict[str, Any] = field(default_factory=dict)
    retry_options: dict[str, Any] = field(default_factory=dict)
    retry_of_job_id: str | None = None
    attempts: int = 1
    metadata: dict[str, Any] = field(default_factory=dict)
    job_id: str = field(default_factory=lambda: new_id("ingest"))
    # Timestamps are the strings produced by ``utc_now().isoformat()``.
    created_at: str = field(default_factory=lambda: utc_now().isoformat())
    updated_at: str = field(default_factory=lambda: utc_now().isoformat())
    completed_at: str | None = None

    @classmethod
    def create(
        cls,
        *,
        input: dict[str, Any],
        actor_id: str,
        correlation_id: str,
        retry_of_job_id: str | None = None,
        metadata: dict[str, Any] | None = None,
    ) -> "IngestionJob":
        """Create a new job with default status, id and timestamps.

        ``input`` and ``metadata`` are shallow-copied so later changes to the
        caller's dicts do not leak into the job.
        """
        return cls(
            input=dict(input),
            actor_id=actor_id,
            correlation_id=correlation_id,
            retry_of_job_id=retry_of_job_id,
            metadata=dict(metadata or {}),
        )

    def running(self, *, source_ref: SourceReference | None = None) -> "IngestionJob":
        """Return a copy marked RUNNING with ``updated_at`` refreshed.

        Args:
            source_ref: Replaces the stored reference when given; ``None``
                keeps the current one.
        """
        return replace(
            self,
            status=IngestionJobStatus.RUNNING,
            # Explicit None check: "not supplied" must not depend on the
            # truthiness of the SourceReference object.
            source_ref=self.source_ref if source_ref is None else source_ref,
            updated_at=utc_now().isoformat(),
        )

    def completed(
        self,
        *,
        output_asset_ids: tuple[str, ...],
        partial_results: dict[str, Any] | None = None,
    ) -> "IngestionJob":
        """Return a copy marked COMPLETED.

        ``updated_at`` and ``completed_at`` receive the same timestamp.

        Args:
            output_asset_ids: Ids of the assets the job produced.
            partial_results: Replaces the stored results when given, including
                an empty dict; ``None`` keeps the current ones.
        """
        now = utc_now().isoformat()
        # ``is None`` rather than ``or``: an explicitly passed empty dict is a
        # real value and must replace the stored results, not be ignored.
        results = self.partial_results if partial_results is None else partial_results
        return replace(
            self,
            status=IngestionJobStatus.COMPLETED,
            output_asset_ids=tuple(output_asset_ids),
            partial_results=dict(results),
            updated_at=now,
            completed_at=now,
        )

    def failed(
        self,
        failure: IngestionFailure,
        *,
        status: IngestionJobStatus = IngestionJobStatus.FAILED,
        partial_results: dict[str, Any] | None = None,
    ) -> "IngestionJob":
        """Return a copy with ``failure`` appended to ``failures``.

        ``updated_at`` and ``completed_at`` receive the same timestamp.

        Args:
            failure: The failure to append to the existing ones.
            status: Status to record; FAILED unless the caller overrides it.
            partial_results: Replaces the stored results when given, including
                an empty dict; ``None`` keeps the current ones.
        """
        now = utc_now().isoformat()
        # Same explicit None check as in completed(); see the comment there.
        results = self.partial_results if partial_results is None else partial_results
        return replace(
            self,
            status=status,
            failures=self.failures + (failure,),
            partial_results=dict(results),
            updated_at=now,
            completed_at=now,
        )

    def partially_completed(
        self,
        *,
        output_asset_ids: tuple[str, ...],
        failures: tuple[IngestionFailure, ...],
        partial_results: dict[str, Any],
    ) -> "IngestionJob":
        """Return a copy marked PARTIALLY_COMPLETED.

        Unlike :meth:`failed`, the given ``failures`` replace the stored ones
        rather than being appended. ``updated_at`` and ``completed_at``
        receive the same timestamp.
        """
        now = utc_now().isoformat()
        return replace(
            self,
            status=IngestionJobStatus.PARTIALLY_COMPLETED,
            output_asset_ids=tuple(output_asset_ids),
            failures=tuple(failures),
            partial_results=dict(partial_results),
            updated_at=now,
            completed_at=now,
        )

    def to_dict(self) -> dict[str, Any]:
        """Return a plain-dict form of the job via ``compact_dict``.

        The status is emitted as its string value, tuples as lists, and
        nested objects through their own ``to_dict`` methods.
        """
        return compact_dict(
            {
                "job_id": self.job_id,
                "status": self.status.value,
                "input": dict(self.input),
                "actor_id": self.actor_id,
                "correlation_id": self.correlation_id,
                "source_ref": self.source_ref.to_dict() if self.source_ref else None,
                "output_asset_ids": list(self.output_asset_ids),
                "failures": [failure.to_dict() for failure in self.failures],
                "partial_results": dict(self.partial_results),
                "retry_options": dict(self.retry_options),
                "retry_of_job_id": self.retry_of_job_id,
                "attempts": self.attempts,
                "metadata": dict(self.metadata),
                "created_at": self.created_at,
                "updated_at": self.updated_at,
                "completed_at": self.completed_at,
            }
        )

    @classmethod
    def from_dict(cls, data: dict[str, Any]) -> "IngestionJob":
        """Rebuild a job from a mapping such as the output of :meth:`to_dict`.

        Collection-valued keys may be missing or explicitly ``None``; both
        yield an empty container.

        Raises:
            KeyError: If ``job_id``, ``status``, ``actor_id``,
                ``correlation_id``, ``created_at`` or ``updated_at`` is missing.
            ValueError: If ``status`` is not a valid ``IngestionJobStatus`` value.
        """
        source_ref = data.get("source_ref")
        # ``or`` fallbacks below treat an explicit None (e.g. a JSON null) like
        # a missing key instead of letting dict(None) / tuple(None) raise.
        return cls(
            job_id=data["job_id"],
            status=IngestionJobStatus(data["status"]),
            input=dict(data.get("input") or {}),
            actor_id=data["actor_id"],
            correlation_id=data["correlation_id"],
            source_ref=SourceReference.from_dict(source_ref) if source_ref else None,
            output_asset_ids=tuple(data.get("output_asset_ids") or ()),
            failures=tuple(
                IngestionFailure.from_dict(item) for item in data.get("failures") or ()
            ),
            partial_results=dict(data.get("partial_results") or {}),
            retry_options=dict(data.get("retry_options") or {}),
            retry_of_job_id=data.get("retry_of_job_id"),
            attempts=int(data.get("attempts", 1)),
            metadata=dict(data.get("metadata") or {}),
            created_at=data["created_at"],
            updated_at=data["updated_at"],
            completed_at=data.get("completed_at"),
        )
|
||||
Reference in New Issue
Block a user