generated from coulomb/repo-seed
1089 lines
45 KiB
Python
1089 lines
45 KiB
Python
"""SQLite asset registry repository."""
|
|
|
|
from __future__ import annotations
|
|
|
|
import json
|
|
import sqlite3
|
|
from pathlib import Path
|
|
from typing import Any
|
|
|
|
from kontextual_engine.core import (
|
|
Actor,
|
|
AssetRepresentation,
|
|
AssetVersion,
|
|
AuditEvent,
|
|
ContextEntity,
|
|
CoreRelationship,
|
|
DerivedArtifactLineage,
|
|
IdempotencyRecord,
|
|
IngestionJob,
|
|
IngestionJobStatus,
|
|
KnowledgeAsset,
|
|
LifecycleState,
|
|
MetadataRecord,
|
|
MetadataSchema,
|
|
MetadataSchemaAssignment,
|
|
RepresentationKind,
|
|
RelationshipTargetKind,
|
|
RetrievalFeedbackRecord,
|
|
Sensitivity,
|
|
TransformationRun,
|
|
TransformationRunStatus,
|
|
WorkflowRun,
|
|
WorkflowRunStatus,
|
|
WorkflowTemplate,
|
|
)
|
|
from kontextual_engine.errors import NotFoundError, ValidationError
|
|
|
|
|
|
class SQLiteAssetRegistryRepository:
|
|
def __init__(self, path: str | Path) -> None:
    """Bind the repository to the SQLite file at *path* and prepare it.

    The parent directory of *path* is created when it is missing, after
    which ``_initialize`` is invoked.
    """
    database_file = Path(path)
    self.path = database_file
    database_file.parent.mkdir(parents=True, exist_ok=True)
    self._initialize()
|
|
|
|
def save_actor(self, actor: Actor) -> Actor:
    """Insert the actor, or overwrite the row that shares its id.

    Returns the same ``actor`` object that was passed in.
    """
    upsert_sql = """
        insert into actors (id, actor_type, payload)
        values (?, ?, ?)
        on conflict(id) do update set
            actor_type=excluded.actor_type,
            payload=excluded.payload
        """
    with self._connect() as db:
        bindings = (actor.id, actor.actor_type.value, _json(actor.to_dict()))
        db.execute(upsert_sql, bindings)
    return actor
|
|
|
|
def get_actor(self, actor_id: str) -> Actor:
    """Load the actor stored under *actor_id*.

    Raises:
        NotFoundError: no row in ``actors`` has this id.
    """
    found = self._one("select payload from actors where id = ?", (actor_id,))
    if found is not None:
        return Actor.from_dict(_loads(found["payload"]))
    raise NotFoundError("Actor not found", details={"actor_id": actor_id})
|
|
|
|
def save_asset(self, asset: KnowledgeAsset) -> KnowledgeAsset:
    """Insert the asset, or overwrite the row that shares its id.

    Title, asset type and lifecycle are written to their own columns next
    to the JSON payload. Returns the same ``asset`` object.
    """
    upsert_sql = """
        insert into assets (id, title, asset_type, lifecycle, payload)
        values (?, ?, ?, ?, ?)
        on conflict(id) do update set
            title=excluded.title,
            asset_type=excluded.asset_type,
            lifecycle=excluded.lifecycle,
            payload=excluded.payload
        """
    with self._connect() as db:
        bindings = (
            asset.id,
            asset.title,
            asset.classification.asset_type,
            asset.lifecycle.value,
            _json(asset.to_dict()),
        )
        db.execute(upsert_sql, bindings)
    return asset
|
|
|
|
def get_asset(self, asset_id: str) -> KnowledgeAsset:
    """Load the asset stored under *asset_id*.

    Raises:
        NotFoundError: no row in ``assets`` has this id.
    """
    found = self._one("select payload from assets where id = ?", (asset_id,))
    if found is not None:
        return KnowledgeAsset.from_dict(_loads(found["payload"]))
    raise NotFoundError("Asset not found", details={"asset_id": asset_id})
|
|
|
|
def list_assets(
    self,
    *,
    lifecycle: LifecycleState | None = None,
    asset_type: str | None = None,
    sensitivity: Sensitivity | str | None = None,
    owner: str | None = None,
    topic: str | None = None,
    review_state: str | None = None,
    metadata_filters: dict[str, Any] | None = None,
    confirmed_metadata_only: bool = False,
) -> list[KnowledgeAsset]:
    """Return assets ordered by title then id, narrowed by the given filters.

    ``lifecycle`` and ``asset_type`` are applied in SQL. ``sensitivity``,
    ``owner``, ``topic`` and ``review_state`` are applied afterwards, in
    that order, against each asset's ``classification``. When
    ``metadata_filters`` is non-empty, the remaining assets are matched
    against their metadata records via ``_metadata_matches``;
    ``confirmed_metadata_only`` is forwarded to that helper and is not
    consulted otherwise.
    """
    column_filters: dict[str, Any] = {}
    if lifecycle is not None:
        column_filters["lifecycle = ?"] = lifecycle.value
    if asset_type is not None:
        column_filters["asset_type = ?"] = asset_type
    where = f" where {' and '.join(column_filters)}" if column_filters else ""
    rows = self._all(
        f"select payload from assets{where} order by title, id",
        tuple(column_filters.values()),
    )
    assets = [KnowledgeAsset.from_dict(_loads(row["payload"])) for row in rows]

    # Collect the classification predicates first, then apply them one
    # after another so each pass only sees survivors of the previous one.
    classification_checks = []
    if sensitivity is not None:
        # Accepts either the enum member or its raw value.
        wanted_sensitivity = Sensitivity(sensitivity)
        classification_checks.append(lambda c: c.sensitivity == wanted_sensitivity)
    if owner is not None:
        classification_checks.append(lambda c: c.owner == owner)
    if topic is not None:
        classification_checks.append(lambda c: topic in c.topics)
    if review_state is not None:
        classification_checks.append(lambda c: c.review_state == review_state)
    for passes in classification_checks:
        assets = [asset for asset in assets if passes(asset.classification)]

    if not metadata_filters:
        return assets
    records_by_asset = self._metadata_records_for_assets([asset.id for asset in assets])
    return [
        asset
        for asset in assets
        if _metadata_matches(
            records_by_asset.get(asset.id, []),
            metadata_filters,
            confirmed_metadata_only=confirmed_metadata_only,
        )
    ]
|
|
|
|
def save_representation(self, representation: AssetRepresentation) -> AssetRepresentation:
    """Insert the representation, or overwrite the row that shares its id.

    Returns the same ``representation`` object.

    Raises:
        ValidationError: the statement raised ``sqlite3.IntegrityError``;
            it is reported as a reference to an unknown asset.
    """
    upsert_sql = """
        insert into representations (id, asset_id, kind, digest, payload)
        values (?, ?, ?, ?, ?)
        on conflict(id) do update set
            asset_id=excluded.asset_id,
            kind=excluded.kind,
            digest=excluded.digest,
            payload=excluded.payload
        """
    try:
        with self._connect() as db:
            bindings = (
                representation.representation_id,
                representation.asset_id,
                representation.kind.value,
                representation.digest,
                _json(representation.to_dict()),
            )
            db.execute(upsert_sql, bindings)
    except sqlite3.IntegrityError as exc:
        failure_details = {
            "asset_id": representation.asset_id,
            "representation_id": representation.representation_id,
        }
        raise ValidationError(
            "Representation references an unknown asset",
            details=failure_details,
        ) from exc
    return representation
|
|
|
|
def get_representation(self, representation_id: str) -> AssetRepresentation:
    """Load the representation stored under *representation_id*.

    Raises:
        NotFoundError: no row in ``representations`` has this id.
    """
    found = self._one("select payload from representations where id = ?", (representation_id,))
    if found is not None:
        return AssetRepresentation.from_dict(_loads(found["payload"]))
    raise NotFoundError(
        "Representation not found",
        details={"representation_id": representation_id},
    )
|
|
|
|
def list_representations(
    self,
    *,
    asset_id: str | None = None,
    kind: RepresentationKind | None = None,
) -> list[AssetRepresentation]:
    """Return representations ordered by asset id, kind, then id.

    Each argument that is not ``None`` adds an equality filter on the
    column of the same name; filters are combined with ``and``.
    """
    # Insertion order of the dict fixes both clause and parameter order.
    criteria: dict[str, Any] = {}
    if asset_id is not None:
        criteria["asset_id = ?"] = asset_id
    if kind is not None:
        criteria["kind = ?"] = kind.value
    where = f" where {' and '.join(criteria)}" if criteria else ""
    query = f"select payload from representations{where} order by asset_id, kind, id"
    matches = self._all(query, tuple(criteria.values()))
    return [AssetRepresentation.from_dict(_loads(match["payload"])) for match in matches]
|
|
|
|
def save_metadata_record(self, asset_id: str, record: MetadataRecord) -> MetadataRecord:
    """Insert the metadata record for *asset_id*, or overwrite it by id.

    Returns the same ``record`` object.

    Raises:
        ValidationError: the statement raised ``sqlite3.IntegrityError``;
            it is reported as a reference to an unknown asset.
    """
    upsert_sql = """
        insert into metadata_records (id, asset_id, key, payload)
        values (?, ?, ?, ?)
        on conflict(id) do update set
            asset_id=excluded.asset_id,
            key=excluded.key,
            payload=excluded.payload
        """
    try:
        with self._connect() as db:
            bindings = (record.record_id, asset_id, record.key, _json(record.to_dict()))
            db.execute(upsert_sql, bindings)
    except sqlite3.IntegrityError as exc:
        failure_details = {"asset_id": asset_id, "record_id": record.record_id}
        raise ValidationError(
            "Metadata record references an unknown asset",
            details=failure_details,
        ) from exc
    return record
|
|
|
|
def list_metadata_records(self, asset_id: str) -> list[MetadataRecord]:
    """Return the metadata records of *asset_id*, ordered by key then id.

    Raises:
        NotFoundError: no records were found and ``get_asset`` reports
            that the asset itself does not exist.
    """
    query = "select payload from metadata_records where asset_id = ? order by key, id"
    matches = self._all(query, (asset_id,))
    if not matches:
        # Tell "asset has no records" apart from "asset is unknown":
        # get_asset raises for an unknown id, otherwise we return [].
        self.get_asset(asset_id)
    return [MetadataRecord.from_dict(_loads(match["payload"])) for match in matches]
|
|
|
|
def save_metadata_schema(self, schema: MetadataSchema) -> MetadataSchema:
    """Insert the metadata schema, or overwrite the row that shares its id.

    Returns the same ``schema`` object.
    """
    upsert_sql = """
        insert into metadata_schemas (id, name, version, payload)
        values (?, ?, ?, ?)
        on conflict(id) do update set
            name=excluded.name,
            version=excluded.version,
            payload=excluded.payload
        """
    with self._connect() as db:
        bindings = (schema.schema_id, schema.name, schema.version, _json(schema.to_dict()))
        db.execute(upsert_sql, bindings)
    return schema
|
|
|
|
def get_metadata_schema(self, schema_id: str) -> MetadataSchema:
    """Load the metadata schema stored under *schema_id*.

    Raises:
        NotFoundError: no row in ``metadata_schemas`` has this id.
    """
    found = self._one("select payload from metadata_schemas where id = ?", (schema_id,))
    if found is not None:
        return MetadataSchema.from_dict(_loads(found["payload"]))
    raise NotFoundError("Metadata schema not found", details={"schema_id": schema_id})
|
|
|
|
def list_metadata_schemas(self) -> list[MetadataSchema]:
    """Return every stored metadata schema, ordered by name then id."""
    query = "select payload from metadata_schemas order by name, id"
    schemas = []
    for match in self._all(query, ()):
        schemas.append(MetadataSchema.from_dict(_loads(match["payload"])))
    return schemas
|
|
|
|
def save_metadata_schema_assignment(
    self,
    assignment: MetadataSchemaAssignment,
) -> MetadataSchemaAssignment:
    """Insert the schema assignment, or overwrite the row sharing its id.

    The referenced schema is looked up first. Returns the same
    ``assignment`` object.

    Raises:
        NotFoundError: ``assignment.schema_id`` names no stored schema.
    """
    # Existence probe; the returned schema itself is not needed.
    self.get_metadata_schema(assignment.schema_id)
    upsert_sql = """
        insert into metadata_schema_assignments (id, schema_id, priority, payload)
        values (?, ?, ?, ?)
        on conflict(id) do update set
            schema_id=excluded.schema_id,
            priority=excluded.priority,
            payload=excluded.payload
        """
    with self._connect() as db:
        bindings = (
            assignment.assignment_id,
            assignment.schema_id,
            assignment.priority,
            _json(assignment.to_dict()),
        )
        db.execute(upsert_sql, bindings)
    return assignment
|
|
|
|
def get_metadata_schema_assignment(self, assignment_id: str) -> MetadataSchemaAssignment:
    """Load the schema assignment stored under *assignment_id*.

    Raises:
        NotFoundError: no row in ``metadata_schema_assignments`` has this id.
    """
    query = "select payload from metadata_schema_assignments where id = ?"
    found = self._one(query, (assignment_id,))
    if found is not None:
        return MetadataSchemaAssignment.from_dict(_loads(found["payload"]))
    raise NotFoundError(
        "Metadata schema assignment not found",
        details={"assignment_id": assignment_id},
    )
|
|
|
|
def list_metadata_schema_assignments(self) -> list[MetadataSchemaAssignment]:
    """Return every schema assignment, ordered by priority, schema id, id."""
    query = "select payload from metadata_schema_assignments order by priority, schema_id, id"
    assignments = []
    for match in self._all(query, ()):
        assignments.append(MetadataSchemaAssignment.from_dict(_loads(match["payload"])))
    return assignments
|
|
|
|
def save_context_entity(self, entity: ContextEntity) -> ContextEntity:
    """Insert the context entity, or overwrite the row that shares its id.

    Returns the same ``entity`` object.
    """
    upsert_sql = """
        insert into context_entities (id, entity_type, name, payload)
        values (?, ?, ?, ?)
        on conflict(id) do update set
            entity_type=excluded.entity_type,
            name=excluded.name,
            payload=excluded.payload
        """
    with self._connect() as db:
        bindings = (
            entity.entity_id,
            entity.entity_type.value,
            entity.name,
            _json(entity.to_dict()),
        )
        db.execute(upsert_sql, bindings)
    return entity
|
|
|
|
def get_context_entity(self, entity_id: str) -> ContextEntity:
    """Load the context entity stored under *entity_id*.

    Raises:
        NotFoundError: no row in ``context_entities`` has this id.
    """
    found = self._one("select payload from context_entities where id = ?", (entity_id,))
    if found is not None:
        return ContextEntity.from_dict(_loads(found["payload"]))
    raise NotFoundError("Context entity not found", details={"entity_id": entity_id})
|
|
|
|
def list_context_entities(self) -> list[ContextEntity]:
    """Return every context entity, ordered by entity type, name, then id."""
    query = "select payload from context_entities order by entity_type, name, id"
    entities = []
    for match in self._all(query, ()):
        entities.append(ContextEntity.from_dict(_loads(match["payload"])))
    return entities
|
|
|
|
def save_relationship(self, relationship: CoreRelationship) -> CoreRelationship:
    """Insert the relationship, or overwrite the row that shares its id.

    Both endpoints are looked up before writing: the source must be an
    asset, and the target must be an asset when ``target_kind`` is
    ``RelationshipTargetKind.ASSET`` and a context entity otherwise.
    Returns the same ``relationship`` object.

    Raises:
        NotFoundError: the source or the target does not exist.
    """
    self.get_asset(relationship.source_id)
    # Pick the lookup that matches the kind of target being referenced.
    if relationship.target_kind == RelationshipTargetKind.ASSET:
        require_target = self.get_asset
    else:
        require_target = self.get_context_entity
    require_target(relationship.target_id)

    upsert_sql = """
        insert into core_relationships (id, source_id, target_id, target_kind, predicate, payload)
        values (?, ?, ?, ?, ?, ?)
        on conflict(id) do update set
            source_id=excluded.source_id,
            target_id=excluded.target_id,
            target_kind=excluded.target_kind,
            predicate=excluded.predicate,
            payload=excluded.payload
        """
    with self._connect() as db:
        bindings = (
            relationship.relationship_id,
            relationship.source_id,
            relationship.target_id,
            relationship.target_kind.value,
            relationship.predicate,
            _json(relationship.to_dict()),
        )
        db.execute(upsert_sql, bindings)
    return relationship
|
|
|
|
def get_relationship(self, relationship_id: str) -> CoreRelationship:
    """Load the relationship stored under *relationship_id*.

    Raises:
        NotFoundError: no row in ``core_relationships`` has this id.
    """
    found = self._one("select payload from core_relationships where id = ?", (relationship_id,))
    if found is not None:
        return CoreRelationship.from_dict(_loads(found["payload"]))
    raise NotFoundError(
        "Relationship not found",
        details={"relationship_id": relationship_id},
    )
|
|
|
|
def list_relationships(
    self,
    *,
    source_id: str | None = None,
    target_id: str | None = None,
) -> list[CoreRelationship]:
    """Return relationships ordered by source, target, predicate, then id.

    Each argument that is not ``None`` adds an equality filter on the
    column of the same name; filters are combined with ``and``.
    """
    # Insertion order of the dict fixes both clause and parameter order.
    criteria: dict[str, Any] = {}
    if source_id is not None:
        criteria["source_id = ?"] = source_id
    if target_id is not None:
        criteria["target_id = ?"] = target_id
    where = f" where {' and '.join(criteria)}" if criteria else ""
    query = f"select payload from core_relationships{where} order by source_id, target_id, predicate, id"
    matches = self._all(query, tuple(criteria.values()))
    return [CoreRelationship.from_dict(_loads(match["payload"])) for match in matches]
|
|
|
|
def save_version(self, version: AssetVersion) -> AssetVersion:
    """Insert a new asset version row; existing rows are never overwritten.

    Returns the same ``version`` object.

    Raises:
        ValidationError: the insert raised ``sqlite3.IntegrityError``.
            When ``_is_foreign_key_error`` recognises it, the error is
            reported as an unknown asset; every other integrity failure
            is reported as a duplicate sequence for the asset.
    """
    insert_sql = """
        insert into asset_versions (id, asset_id, sequence, change_type, payload)
        values (?, ?, ?, ?, ?)
        """
    try:
        with self._connect() as db:
            bindings = (
                version.version_id,
                version.asset_id,
                version.sequence,
                version.change_type.value,
                _json(version.to_dict()),
            )
            db.execute(insert_sql, bindings)
    except sqlite3.IntegrityError as exc:
        if not _is_foreign_key_error(exc):
            raise ValidationError(
                "Version sequence already exists for asset",
                details={"asset_id": version.asset_id, "sequence": version.sequence},
            ) from exc
        raise ValidationError(
            "Version references an unknown asset",
            details={"asset_id": version.asset_id, "version_id": version.version_id},
        ) from exc
    return version
|
|
|
|
def list_versions(self, asset_id: str) -> list[AssetVersion]:
    """Return the versions of *asset_id*, ordered by sequence.

    Raises:
        NotFoundError: no versions were found and ``get_asset`` reports
            that the asset itself does not exist.
    """
    query = "select payload from asset_versions where asset_id = ? order by sequence"
    matches = self._all(query, (asset_id,))
    if not matches:
        # Tell "asset has no versions" apart from "asset is unknown":
        # get_asset raises for an unknown id, otherwise we return [].
        self.get_asset(asset_id)
    return [AssetVersion.from_dict(_loads(match["payload"])) for match in matches]
|
|
|
|
def save_audit_event(self, event: AuditEvent) -> AuditEvent:
    """Insert the audit event, or overwrite the row that shares its id.

    Returns the same ``event`` object.

    Raises:
        ValidationError: the statement failed with an integrity error
            that ``_is_foreign_key_error`` recognises (unknown actor).
        sqlite3.IntegrityError: any other integrity failure, re-raised
            unchanged.
    """
    upsert_sql = """
        insert into audit_events (id, target, actor_id, correlation_id, outcome, occurred_at, payload)
        values (?, ?, ?, ?, ?, ?, ?)
        on conflict(id) do update set
            target=excluded.target,
            actor_id=excluded.actor_id,
            correlation_id=excluded.correlation_id,
            outcome=excluded.outcome,
            occurred_at=excluded.occurred_at,
            payload=excluded.payload
        """
    try:
        with self._connect() as db:
            bindings = (
                event.event_id,
                event.target,
                event.actor_id,
                event.correlation_id,
                event.outcome.value,
                event.occurred_at,
                _json(event.to_dict()),
            )
            db.execute(upsert_sql, bindings)
    except sqlite3.IntegrityError as exc:
        if not _is_foreign_key_error(exc):
            raise
        raise ValidationError(
            "Audit event references an unknown actor",
            details={"actor_id": event.actor_id, "event_id": event.event_id},
        ) from exc
    return event
|
|
|
|
def get_audit_event(self, event_id: str) -> AuditEvent:
    """Load the audit event stored under *event_id*.

    Raises:
        NotFoundError: no row in ``audit_events`` has this id.
    """
    found = self._one("select payload from audit_events where id = ?", (event_id,))
    if found is not None:
        return AuditEvent.from_dict(_loads(found["payload"]))
    raise NotFoundError("Audit event not found", details={"event_id": event_id})
|
|
|
|
def list_audit_events(
    self,
    *,
    target: str | None = None,
    correlation_id: str | None = None,
) -> list[AuditEvent]:
    """Return audit events ordered by ``occurred_at`` then rowid.

    Each argument that is not ``None`` adds an equality filter on the
    column of the same name; filters are combined with ``and``.
    """
    # Insertion order of the dict fixes both clause and parameter order.
    criteria: dict[str, Any] = {}
    if target is not None:
        criteria["target = ?"] = target
    if correlation_id is not None:
        criteria["correlation_id = ?"] = correlation_id
    where = f" where {' and '.join(criteria)}" if criteria else ""
    query = f"select payload from audit_events{where} order by occurred_at, rowid"
    matches = self._all(query, tuple(criteria.values()))
    return [AuditEvent.from_dict(_loads(match["payload"])) for match in matches]
|
|
|
|
def save_retrieval_feedback(self, record: RetrievalFeedbackRecord) -> RetrievalFeedbackRecord:
    """Insert the feedback record, or overwrite the row sharing its id.

    Returns the same ``record`` object.

    Raises:
        ValidationError: the statement failed with an integrity error
            that ``_is_foreign_key_error`` recognises (unknown actor).
        sqlite3.IntegrityError: any other integrity failure, re-raised
            unchanged.
    """
    upsert_sql = """
        insert into retrieval_feedback (id, label, actor_id, correlation_id, created_at, payload)
        values (?, ?, ?, ?, ?, ?)
        on conflict(id) do update set
            label=excluded.label,
            actor_id=excluded.actor_id,
            correlation_id=excluded.correlation_id,
            created_at=excluded.created_at,
            payload=excluded.payload
        """
    try:
        with self._connect() as db:
            bindings = (
                record.feedback_id,
                record.label.value,
                record.actor_id,
                record.correlation_id,
                record.created_at,
                _json(record.to_dict()),
            )
            db.execute(upsert_sql, bindings)
    except sqlite3.IntegrityError as exc:
        if not _is_foreign_key_error(exc):
            raise
        raise ValidationError(
            "Retrieval feedback references an unknown actor",
            details={"actor_id": record.actor_id, "feedback_id": record.feedback_id},
        ) from exc
    return record
|
|
|
|
def list_retrieval_feedback(
    self,
    *,
    correlation_id: str | None = None,
    label: str | None = None,
) -> list[RetrievalFeedbackRecord]:
    """Return retrieval feedback ordered by ``created_at`` then id.

    Each argument that is not ``None`` adds an equality filter on the
    column of the same name; filters are combined with ``and``. ``label``
    is bound as given, so it must already be the stored string value.
    """
    # Insertion order of the dict fixes both clause and parameter order.
    criteria: dict[str, Any] = {}
    if correlation_id is not None:
        criteria["correlation_id = ?"] = correlation_id
    if label is not None:
        criteria["label = ?"] = label
    where = f" where {' and '.join(criteria)}" if criteria else ""
    query = f"select payload from retrieval_feedback{where} order by created_at, id"
    matches = self._all(query, tuple(criteria.values()))
    return [RetrievalFeedbackRecord.from_dict(_loads(match["payload"])) for match in matches]
|
|
|
|
def save_idempotency_record(self, record: IdempotencyRecord) -> IdempotencyRecord:
    """Insert the idempotency record, or overwrite the row with its key.

    Returns the same ``record`` object.
    """
    upsert_sql = """
        insert into idempotency_records (key, operation, request_hash, status, payload)
        values (?, ?, ?, ?, ?)
        on conflict(key) do update set
            operation=excluded.operation,
            request_hash=excluded.request_hash,
            status=excluded.status,
            payload=excluded.payload
        """
    with self._connect() as db:
        bindings = (
            record.key,
            record.operation,
            record.request_hash,
            record.status.value,
            _json(record.to_dict()),
        )
        db.execute(upsert_sql, bindings)
    return record
|
|
|
|
def get_idempotency_record(self, key: str) -> IdempotencyRecord | None:
    """Load the idempotency record stored under *key*.

    Unlike the other getters in this class, a missing row yields ``None``
    instead of raising.
    """
    found = self._one("select payload from idempotency_records where key = ?", (key,))
    return None if found is None else IdempotencyRecord.from_dict(_loads(found["payload"]))
|
|
|
|
def save_ingestion_job(self, job: IngestionJob) -> IngestionJob:
    """Insert the ingestion job, or update the row that shares its id.

    On update the ``created_at`` column is left as first written; every
    other column, including the payload, is replaced. Returns the same
    ``job`` object.

    Raises:
        ValidationError: the statement failed with an integrity error
            that ``_is_foreign_key_error`` recognises (unknown actor).
        sqlite3.IntegrityError: any other integrity failure, re-raised
            unchanged.
    """
    upsert_sql = """
        insert into ingestion_jobs (id, status, actor_id, correlation_id, created_at, updated_at, payload)
        values (?, ?, ?, ?, ?, ?, ?)
        on conflict(id) do update set
            status=excluded.status,
            actor_id=excluded.actor_id,
            correlation_id=excluded.correlation_id,
            updated_at=excluded.updated_at,
            payload=excluded.payload
        """
    try:
        with self._connect() as db:
            bindings = (
                job.job_id,
                job.status.value,
                job.actor_id,
                job.correlation_id,
                job.created_at,
                job.updated_at,
                _json(job.to_dict()),
            )
            db.execute(upsert_sql, bindings)
    except sqlite3.IntegrityError as exc:
        if not _is_foreign_key_error(exc):
            raise
        raise ValidationError(
            "Ingestion job references an unknown actor",
            details={"actor_id": job.actor_id, "job_id": job.job_id},
        ) from exc
    return job
|
|
|
|
def get_ingestion_job(self, job_id: str) -> IngestionJob:
    """Load the ingestion job stored under *job_id*.

    Raises:
        NotFoundError: no row in ``ingestion_jobs`` has this id.
    """
    found = self._one("select payload from ingestion_jobs where id = ?", (job_id,))
    if found is not None:
        return IngestionJob.from_dict(_loads(found["payload"]))
    raise NotFoundError("Ingestion job not found", details={"job_id": job_id})
|
|
|
|
def list_ingestion_jobs(
    self,
    *,
    status: IngestionJobStatus | None = None,
) -> list[IngestionJob]:
    """Return ingestion jobs ordered by ``created_at`` then id.

    When *status* is given, only jobs whose ``status`` column equals
    ``status.value`` are returned.
    """
    where = ""
    bindings: tuple[Any, ...] = ()
    if status is not None:
        where = " where status = ?"
        bindings = (status.value,)
    query = f"select payload from ingestion_jobs{where} order by created_at, id"
    matches = self._all(query, bindings)
    return [IngestionJob.from_dict(_loads(match["payload"])) for match in matches]
|
|
|
|
def save_transformation_run(self, run: TransformationRun) -> TransformationRun:
    """Insert the transformation run, or overwrite the row sharing its id.

    Returns the same ``run`` object.

    Raises:
        ValidationError: the statement failed with an integrity error
            that ``_is_foreign_key_error`` recognises (unknown actor).
        sqlite3.IntegrityError: any other integrity failure, re-raised
            unchanged.
    """
    upsert_sql = """
        insert into transformation_runs
            (id, operation_id, status, actor_id, correlation_id, queued_at, updated_at, payload)
        values (?, ?, ?, ?, ?, ?, ?, ?)
        on conflict(id) do update set
            operation_id=excluded.operation_id,
            status=excluded.status,
            actor_id=excluded.actor_id,
            correlation_id=excluded.correlation_id,
            queued_at=excluded.queued_at,
            updated_at=excluded.updated_at,
            payload=excluded.payload
        """
    try:
        with self._connect() as db:
            bindings = (
                run.run_id,
                run.operation_id,
                run.status.value,
                run.actor_id,
                run.correlation_id,
                run.queued_at,
                run.updated_at,
                _json(run.to_dict()),
            )
            db.execute(upsert_sql, bindings)
    except sqlite3.IntegrityError as exc:
        if not _is_foreign_key_error(exc):
            raise
        raise ValidationError(
            "Transformation run references an unknown actor",
            details={"actor_id": run.actor_id, "run_id": run.run_id},
        ) from exc
    return run
|
|
|
|
def get_transformation_run(self, run_id: str) -> TransformationRun:
    """Load the transformation run stored under *run_id*.

    Raises:
        NotFoundError: no row in ``transformation_runs`` has this id.
    """
    found = self._one("select payload from transformation_runs where id = ?", (run_id,))
    if found is not None:
        return TransformationRun.from_dict(_loads(found["payload"]))
    raise NotFoundError("Transformation run not found", details={"run_id": run_id})
|
|
|
|
def list_transformation_runs(
    self,
    *,
    status: TransformationRunStatus | None = None,
    operation_id: str | None = None,
) -> list[TransformationRun]:
    """Return transformation runs ordered by ``queued_at`` then id.

    Each argument that is not ``None`` adds an equality filter on the
    column of the same name (``status`` is compared by its ``value``);
    filters are combined with ``and``.
    """
    # Insertion order of the dict fixes both clause and parameter order.
    criteria: dict[str, Any] = {}
    if status is not None:
        criteria["status = ?"] = status.value
    if operation_id is not None:
        criteria["operation_id = ?"] = operation_id
    where = f" where {' and '.join(criteria)}" if criteria else ""
    query = f"select payload from transformation_runs{where} order by queued_at, id"
    matches = self._all(query, tuple(criteria.values()))
    return [TransformationRun.from_dict(_loads(match["payload"])) for match in matches]
|
|
|
|
def save_derived_lineage(self, lineage: DerivedArtifactLineage) -> DerivedArtifactLineage:
    """Insert the lineage record, or overwrite the row that shares its id.

    Returns the same ``lineage`` object.

    Raises:
        ValidationError: the statement failed with an integrity error
            that ``_is_foreign_key_error`` recognises (unknown output
            asset or transformation run).
        sqlite3.IntegrityError: any other integrity failure, re-raised
            unchanged.
    """
    upsert_sql = """
        insert into derived_lineage
            (id, output_asset_id, transformation_run_id, payload)
        values (?, ?, ?, ?)
        on conflict(id) do update set
            output_asset_id=excluded.output_asset_id,
            transformation_run_id=excluded.transformation_run_id,
            payload=excluded.payload
        """
    try:
        with self._connect() as db:
            bindings = (
                lineage.lineage_id,
                lineage.output_asset_id,
                lineage.transformation_run_id,
                _json(lineage.to_dict()),
            )
            db.execute(upsert_sql, bindings)
    except sqlite3.IntegrityError as exc:
        if not _is_foreign_key_error(exc):
            raise
        failure_details = {
            "output_asset_id": lineage.output_asset_id,
            "transformation_run_id": lineage.transformation_run_id,
            "lineage_id": lineage.lineage_id,
        }
        raise ValidationError(
            "Derived lineage references an unknown output asset or transformation run",
            details=failure_details,
        ) from exc
    return lineage
|
|
|
|
def get_derived_lineage(self, lineage_id: str) -> DerivedArtifactLineage:
    """Load the lineage record stored under *lineage_id*.

    Raises:
        NotFoundError: no row in ``derived_lineage`` has this id.
    """
    found = self._one("select payload from derived_lineage where id = ?", (lineage_id,))
    if found is not None:
        return DerivedArtifactLineage.from_dict(_loads(found["payload"]))
    raise NotFoundError("Derived lineage not found", details={"lineage_id": lineage_id})
|
|
|
|
def list_derived_lineage(
    self,
    *,
    output_asset_id: str | None = None,
    source_asset_id: str | None = None,
    transformation_run_id: str | None = None,
) -> list[DerivedArtifactLineage]:
    """Return lineage records ordered by transformation run id then id.

    ``output_asset_id`` and ``transformation_run_id`` are filtered in SQL.
    ``source_asset_id`` is applied afterwards by membership in each
    record's ``source_asset_ids``.
    """
    # Insertion order of the dict fixes both clause and parameter order.
    criteria: dict[str, Any] = {}
    if output_asset_id is not None:
        criteria["output_asset_id = ?"] = output_asset_id
    if transformation_run_id is not None:
        criteria["transformation_run_id = ?"] = transformation_run_id
    where = f" where {' and '.join(criteria)}" if criteria else ""
    query = f"select payload from derived_lineage{where} order by transformation_run_id, id"
    matches = self._all(query, tuple(criteria.values()))
    lineages = [DerivedArtifactLineage.from_dict(_loads(match["payload"])) for match in matches]
    if source_asset_id is None:
        return lineages
    return [item for item in lineages if source_asset_id in item.source_asset_ids]
|
|
|
|
def save_workflow_template(self, template: WorkflowTemplate) -> WorkflowTemplate:
    """Insert the workflow template, or overwrite the same (id, version).

    Rows are keyed by the pair of template id and version, so saving a
    new version adds a row instead of replacing an older one. Returns the
    same ``template`` object.
    """
    upsert_sql = """
        insert into workflow_templates
            (id, version, name, updated_at, payload)
        values (?, ?, ?, ?, ?)
        on conflict(id, version) do update set
            name=excluded.name,
            updated_at=excluded.updated_at,
            payload=excluded.payload
        """
    with self._connect() as db:
        bindings = (
            template.template_id,
            template.version,
            template.name,
            template.updated_at,
            _json(template.to_dict()),
        )
        db.execute(upsert_sql, bindings)
    return template
|
|
|
|
def get_workflow_template(
    self,
    template_id: str,
    *,
    version: str | None = None,
) -> WorkflowTemplate:
    """Load one version of the workflow template *template_id*.

    With an explicit *version*, that exact (id, version) row is fetched.
    Without one, the row that sorts first by ``updated_at`` descending and
    then ``version`` descending is used.

    Raises:
        NotFoundError: no matching row exists; ``version`` is included in
            the error details only when it was supplied.
    """
    if version is not None:
        found = self._one(
            "select payload from workflow_templates where id = ? and version = ?",
            (template_id, version),
        )
    else:
        latest_sql = """
            select payload from workflow_templates
            where id = ?
            order by updated_at desc, version desc
            limit 1
            """
        found = self._one(latest_sql, (template_id,))

    if found is not None:
        return WorkflowTemplate.from_dict(_loads(found["payload"]))

    details = {"template_id": template_id}
    if version is not None:
        details["version"] = version
    raise NotFoundError("Workflow template not found", details=details)
|
|
|
|
def list_workflow_templates(
    self,
    *,
    template_id: str | None = None,
) -> list[WorkflowTemplate]:
    """Return workflow templates ordered by name, version, then id.

    When *template_id* is given, only rows with that id are returned.
    """
    where = ""
    bindings: tuple[Any, ...] = ()
    if template_id is not None:
        where = " where id = ?"
        bindings = (template_id,)
    query = f"select payload from workflow_templates{where} order by name, version, id"
    matches = self._all(query, bindings)
    return [WorkflowTemplate.from_dict(_loads(match["payload"])) for match in matches]
|
|
|
|
def save_workflow_run(self, run: WorkflowRun) -> WorkflowRun:
    """Insert the workflow run, or overwrite the row that shares its id.

    Returns the same ``run`` object.

    Raises:
        ValidationError: the statement failed with an integrity error
            that ``_is_foreign_key_error`` recognises (unknown actor or
            template).
        sqlite3.IntegrityError: any other integrity failure, re-raised
            unchanged.
    """
    upsert_sql = """
        insert into workflow_runs
            (id, template_id, template_version, status, actor_id, correlation_id,
             queued_at, updated_at, payload)
        values (?, ?, ?, ?, ?, ?, ?, ?, ?)
        on conflict(id) do update set
            template_id=excluded.template_id,
            template_version=excluded.template_version,
            status=excluded.status,
            actor_id=excluded.actor_id,
            correlation_id=excluded.correlation_id,
            queued_at=excluded.queued_at,
            updated_at=excluded.updated_at,
            payload=excluded.payload
        """
    try:
        with self._connect() as db:
            bindings = (
                run.run_id,
                run.template_id,
                run.template_version,
                run.status.value,
                run.actor_id,
                run.correlation_id,
                run.queued_at,
                run.updated_at,
                _json(run.to_dict()),
            )
            db.execute(upsert_sql, bindings)
    except sqlite3.IntegrityError as exc:
        if not _is_foreign_key_error(exc):
            raise
        failure_details = {
            "actor_id": run.actor_id,
            "template_id": run.template_id,
            "template_version": run.template_version,
            "run_id": run.run_id,
        }
        raise ValidationError(
            "Workflow run references an unknown actor or template",
            details=failure_details,
        ) from exc
    return run
|
|
|
|
def get_workflow_run(self, run_id: str) -> WorkflowRun:
    """Load the workflow run stored under *run_id*.

    Raises:
        NotFoundError: no row in ``workflow_runs`` has this id.
    """
    found = self._one("select payload from workflow_runs where id = ?", (run_id,))
    if found is not None:
        return WorkflowRun.from_dict(_loads(found["payload"]))
    raise NotFoundError("Workflow run not found", details={"run_id": run_id})
|
|
|
|
def list_workflow_runs(
    self,
    *,
    status: WorkflowRunStatus | None = None,
    template_id: str | None = None,
) -> list[WorkflowRun]:
    """Return workflow runs ordered by ``queued_at`` then id.

    Each argument that is not ``None`` adds an equality filter on the
    column of the same name (``status`` is compared by its ``value``);
    filters are combined with ``and``.
    """
    # Insertion order of the dict fixes both clause and parameter order.
    criteria: dict[str, Any] = {}
    if status is not None:
        criteria["status = ?"] = status.value
    if template_id is not None:
        criteria["template_id = ?"] = template_id
    where = f" where {' and '.join(criteria)}" if criteria else ""
    query = f"select payload from workflow_runs{where} order by queued_at, id"
    matches = self._all(query, tuple(criteria.values()))
    return [WorkflowRun.from_dict(_loads(match["payload"])) for match in matches]
|
|
|
|
def _initialize(self) -> None:
|
|
with self._connect() as conn:
|
|
conn.executescript(
|
|
"""
|
|
create table if not exists actors (
|
|
id text primary key,
|
|
actor_type text not null,
|
|
payload text not null
|
|
);
|
|
create table if not exists assets (
|
|
id text primary key,
|
|
title text not null,
|
|
asset_type text not null,
|
|
lifecycle text not null,
|
|
payload text not null
|
|
);
|
|
create table if not exists representations (
|
|
id text primary key,
|
|
asset_id text not null references assets(id) on delete cascade,
|
|
kind text not null,
|
|
digest text not null,
|
|
payload text not null
|
|
);
|
|
create table if not exists metadata_records (
|
|
id text primary key,
|
|
asset_id text not null references assets(id) on delete cascade,
|
|
key text not null,
|
|
payload text not null
|
|
);
|
|
create table if not exists metadata_schemas (
|
|
id text primary key,
|
|
name text not null,
|
|
version text not null,
|
|
payload text not null
|
|
);
|
|
create table if not exists metadata_schema_assignments (
|
|
id text primary key,
|
|
schema_id text not null references metadata_schemas(id) on delete cascade,
|
|
priority integer not null,
|
|
payload text not null
|
|
);
|
|
create table if not exists context_entities (
|
|
id text primary key,
|
|
entity_type text not null,
|
|
name text not null,
|
|
payload text not null
|
|
);
|
|
create table if not exists core_relationships (
|
|
id text primary key,
|
|
source_id text not null references assets(id) on delete cascade,
|
|
target_id text not null,
|
|
target_kind text not null,
|
|
predicate text not null,
|
|
payload text not null
|
|
);
|
|
create table if not exists asset_versions (
|
|
id text primary key,
|
|
asset_id text not null references assets(id) on delete cascade,
|
|
sequence integer not null,
|
|
change_type text not null,
|
|
payload text not null,
|
|
unique(asset_id, sequence)
|
|
);
|
|
create table if not exists audit_events (
|
|
id text primary key,
|
|
target text not null,
|
|
actor_id text not null,
|
|
correlation_id text not null,
|
|
outcome text not null,
|
|
occurred_at text not null,
|
|
payload text not null,
|
|
foreign key(actor_id) references actors(id)
|
|
);
|
|
create table if not exists retrieval_feedback (
|
|
id text primary key,
|
|
label text not null,
|
|
actor_id text not null,
|
|
correlation_id text not null,
|
|
created_at text not null,
|
|
payload text not null,
|
|
foreign key(actor_id) references actors(id)
|
|
);
|
|
create table if not exists idempotency_records (
|
|
key text primary key,
|
|
operation text not null,
|
|
request_hash text not null,
|
|
status text not null,
|
|
payload text not null
|
|
);
|
|
create table if not exists ingestion_jobs (
|
|
id text primary key,
|
|
status text not null,
|
|
actor_id text not null,
|
|
correlation_id text not null,
|
|
created_at text not null,
|
|
updated_at text not null,
|
|
payload text not null,
|
|
foreign key(actor_id) references actors(id)
|
|
);
|
|
create table if not exists transformation_runs (
|
|
id text primary key,
|
|
operation_id text not null,
|
|
status text not null,
|
|
actor_id text not null,
|
|
correlation_id text not null,
|
|
queued_at text not null,
|
|
updated_at text not null,
|
|
payload text not null,
|
|
foreign key(actor_id) references actors(id)
|
|
);
|
|
create table if not exists derived_lineage (
|
|
id text primary key,
|
|
output_asset_id text not null references assets(id) on delete cascade,
|
|
transformation_run_id text not null references transformation_runs(id) on delete cascade,
|
|
payload text not null
|
|
);
|
|
create table if not exists workflow_templates (
|
|
id text not null,
|
|
version text not null,
|
|
name text not null,
|
|
updated_at text not null,
|
|
payload text not null,
|
|
primary key(id, version)
|
|
);
|
|
create table if not exists workflow_runs (
|
|
id text primary key,
|
|
template_id text not null,
|
|
template_version text not null,
|
|
status text not null,
|
|
actor_id text not null,
|
|
correlation_id text not null,
|
|
queued_at text not null,
|
|
updated_at text not null,
|
|
payload text not null,
|
|
foreign key(actor_id) references actors(id),
|
|
foreign key(template_id, template_version)
|
|
references workflow_templates(id, version)
|
|
);
|
|
create index if not exists idx_assets_lifecycle on assets(lifecycle);
|
|
create index if not exists idx_representations_asset on representations(asset_id);
|
|
create index if not exists idx_metadata_asset on metadata_records(asset_id);
|
|
create index if not exists idx_schema_assignments_schema on metadata_schema_assignments(schema_id);
|
|
create index if not exists idx_entities_type on context_entities(entity_type);
|
|
create index if not exists idx_relationships_source on core_relationships(source_id);
|
|
create index if not exists idx_relationships_target on core_relationships(target_id);
|
|
create index if not exists idx_versions_asset on asset_versions(asset_id);
|
|
create index if not exists idx_audit_target on audit_events(target);
|
|
create index if not exists idx_audit_correlation on audit_events(correlation_id);
|
|
create index if not exists idx_retrieval_feedback_label on retrieval_feedback(label);
|
|
create index if not exists idx_retrieval_feedback_correlation on retrieval_feedback(correlation_id);
|
|
create index if not exists idx_ingestion_jobs_status on ingestion_jobs(status);
|
|
create index if not exists idx_ingestion_jobs_correlation on ingestion_jobs(correlation_id);
|
|
create index if not exists idx_transformation_runs_status on transformation_runs(status);
|
|
create index if not exists idx_transformation_runs_operation on transformation_runs(operation_id);
|
|
create index if not exists idx_transformation_runs_correlation on transformation_runs(correlation_id);
|
|
create index if not exists idx_derived_lineage_output on derived_lineage(output_asset_id);
|
|
create index if not exists idx_derived_lineage_run on derived_lineage(transformation_run_id);
|
|
create index if not exists idx_workflow_templates_name on workflow_templates(name);
|
|
create index if not exists idx_workflow_runs_status on workflow_runs(status);
|
|
create index if not exists idx_workflow_runs_template on workflow_runs(template_id);
|
|
create index if not exists idx_workflow_runs_correlation on workflow_runs(correlation_id);
|
|
"""
|
|
)
|
|
|
|
def _connect(self) -> sqlite3.Connection:
|
|
conn = sqlite3.connect(self.path)
|
|
conn.row_factory = sqlite3.Row
|
|
conn.execute("pragma foreign_keys = on")
|
|
return conn
|
|
|
|
def _one(self, query: str, params: tuple[Any, ...]) -> sqlite3.Row | None:
|
|
with self._connect() as conn:
|
|
return conn.execute(query, params).fetchone()
|
|
|
|
def _all(self, query: str, params: tuple[Any, ...]) -> list[sqlite3.Row]:
|
|
with self._connect() as conn:
|
|
return list(conn.execute(query, params).fetchall())
|
|
|
|
def _metadata_records_for_assets(self, asset_ids: list[str]) -> dict[str, list[MetadataRecord]]:
|
|
if not asset_ids:
|
|
return {}
|
|
placeholders = ",".join("?" for _ in asset_ids)
|
|
rows = self._all(
|
|
f"""
|
|
select asset_id, payload from metadata_records
|
|
where asset_id in ({placeholders})
|
|
order by asset_id, key, id
|
|
""",
|
|
tuple(asset_ids),
|
|
)
|
|
records: dict[str, list[MetadataRecord]] = {}
|
|
for row in rows:
|
|
records.setdefault(row["asset_id"], []).append(MetadataRecord.from_dict(_loads(row["payload"])))
|
|
return records
|
|
|
|
|
|
def _json(value: dict[str, Any]) -> str:
|
|
return json.dumps(value, sort_keys=True, separators=(",", ":"))
|
|
|
|
|
|
def _is_foreign_key_error(exc: sqlite3.IntegrityError) -> bool:
|
|
return "FOREIGN KEY" in str(exc).upper()
|
|
|
|
|
|
def _loads(value: str) -> dict[str, Any]:
|
|
return json.loads(value)
|
|
|
|
|
|
def _metadata_matches(
|
|
records: list[MetadataRecord],
|
|
metadata_filters: dict[str, Any],
|
|
*,
|
|
confirmed_metadata_only: bool,
|
|
) -> bool:
|
|
for key, expected in metadata_filters.items():
|
|
candidates = [record for record in records if record.key == key]
|
|
if confirmed_metadata_only:
|
|
candidates = [record for record in candidates if record.confirmed]
|
|
if not any(_metadata_value_matches(record.value, expected) for record in candidates):
|
|
return False
|
|
return True
|
|
|
|
|
|
def _metadata_value_matches(value: Any, expected: Any) -> bool:
|
|
if isinstance(value, list) and not isinstance(expected, list):
|
|
return expected in value
|
|
return value == expected
|