metadata schema validation

This commit is contained in:
2026-05-06 02:50:08 +02:00
parent 565a5643a3
commit c271385e35
8 changed files with 415 additions and 7 deletions

View File

@@ -36,7 +36,11 @@ from .core import (
IngestionJobStatus,
KnowledgeAsset,
LifecycleState,
MetadataFieldDefinition,
MetadataRecord,
MetadataSchema,
MetadataValidationIssue,
MetadataValueType,
NormalizedDocument,
OperationContext,
PolicyDecision,
@@ -135,7 +139,11 @@ __all__ = [
"KnowledgeAsset",
"KontextualError",
"LifecycleState",
"MetadataFieldDefinition",
"MetadataRecord",
"MetadataSchema",
"MetadataValidationIssue",
"MetadataValueType",
"NormalizedDocument",
"NotFoundError",
"OperationRun",

View File

@@ -14,7 +14,16 @@ from .ingestion import (
NormalizedDocument,
SourcePayload,
)
from .metadata import Classification, LifecycleState, MetadataRecord, Sensitivity
from .metadata import (
Classification,
LifecycleState,
MetadataFieldDefinition,
MetadataRecord,
MetadataSchema,
MetadataValidationIssue,
MetadataValueType,
Sensitivity,
)
from .policy import PolicyDecision, PolicyEffect
from .primitives import content_digest, mapping_digest, new_id, stable_json_dumps, utc_now
from .provenance import (
@@ -52,7 +61,11 @@ __all__ = [
"IngestionJobStatus",
"KnowledgeAsset",
"LifecycleState",
"MetadataFieldDefinition",
"MetadataRecord",
"MetadataSchema",
"MetadataValidationIssue",
"MetadataValueType",
"NormalizedDocument",
"OperationContext",
"PolicyDecision",

View File

@@ -25,6 +25,271 @@ class Sensitivity(str, Enum):
RESTRICTED = "restricted"
class MetadataValueType(str, Enum):
    """Value types that a metadata schema field may declare.

    Members are also ``str`` instances, so each one compares equal to its
    lowercase string value (for example ``MetadataValueType.LIST == "list"``).
    """

    # Scalar types.
    STRING = "string"
    INTEGER = "integer"
    NUMBER = "number"
    BOOLEAN = "boolean"

    # Container types.
    LIST = "list"
    OBJECT = "object"
@dataclass(frozen=True)
class MetadataValidationIssue:
code: str
message: str
key: str | None = None
details: dict[str, Any] = field(default_factory=dict)
def to_dict(self) -> dict[str, Any]:
return compact_dict(
{
"code": self.code,
"message": self.message,
"key": self.key,
"details": dict(self.details),
}
)
@dataclass(frozen=True)
class MetadataFieldDefinition:
    """Validation rules for the metadata records stored under one key.

    A definition declares the expected value type and optional constraints
    (presence, cardinality, allowed values, numeric bounds, length bounds and
    confirmation). ``validate`` turns violations into
    ``MetadataValidationIssue`` objects instead of raising.
    """

    key: str
    # Accepts a MetadataValueType or its string value; normalized to the enum
    # in __post_init__.
    value_type: MetadataValueType | str
    required: bool = False
    allow_multiple: bool = False
    # Empty tuple means "no restriction on values".
    allowed_values: tuple[Any, ...] = ()
    # Bounds are inclusive and only applied to non-bool int/float values.
    min_value: int | float | None = None
    max_value: int | float | None = None
    # Length bounds are inclusive and only applied to str/list/dict values.
    min_length: int | None = None
    max_length: int | None = None
    require_confirmed: bool = False
    description: str | None = None
    metadata: dict[str, Any] = field(default_factory=dict)

    def __post_init__(self) -> None:
        """Normalize ``value_type`` to the enum and ``allowed_values`` to a tuple."""
        # The dataclass is frozen, so normalization has to bypass __setattr__.
        object.__setattr__(self, "value_type", MetadataValueType(self.value_type))
        object.__setattr__(self, "allowed_values", tuple(self.allowed_values))

    def _issue(self, code: str, message: str, **details: Any) -> MetadataValidationIssue:
        """Build an issue bound to this field's key."""
        return MetadataValidationIssue(
            code=code,
            key=self.key,
            message=message,
            details=details,
        )

    def validate(self, records: list["MetadataRecord"]) -> list[MetadataValidationIssue]:
        """Validate every record supplied for this field.

        Args:
            records: Records to check. Record keys are not re-checked here;
                the caller is expected to pass only records for this field.

        Returns:
            The issues found, in check order; empty when everything passes.
        """
        if self.required and not records:
            # Nothing else can be checked without at least one record.
            return [
                self._issue(
                    "metadata.required_missing",
                    "Required metadata field is missing",
                )
            ]

        issues: list[MetadataValidationIssue] = []
        if not self.allow_multiple and len(records) > 1:
            issues.append(
                self._issue(
                    "metadata.multiple_not_allowed",
                    "Metadata field does not allow multiple values",
                    count=len(records),
                )
            )
        for record in records:
            issues.extend(self._validate_record(record))
        return issues

    def _validate_record(self, record: "MetadataRecord") -> list[MetadataValidationIssue]:
        """Check one record's value against the type and constraint rules."""
        value = record.value
        if not _matches_type(value, self.value_type):
            # A wrongly typed value makes the remaining checks meaningless.
            return [
                self._issue(
                    "metadata.type_mismatch",
                    "Metadata value does not match schema type",
                    expected=self.value_type.value,
                    actual=type(value).__name__,
                )
            ]

        issues: list[MetadataValidationIssue] = []
        if self.require_confirmed and not record.confirmed:
            issues.append(
                self._issue(
                    "metadata.confirmation_required",
                    "Metadata field requires confirmed value",
                )
            )
        if self.allowed_values and value not in self.allowed_values:
            issues.append(
                self._issue(
                    "metadata.value_not_allowed",
                    "Metadata value is not in the allowed set",
                    allowed_values=list(self.allowed_values),
                    value=value,
                )
            )
        # bool is a subclass of int, so it is excluded from numeric bounds.
        if isinstance(value, (int, float)) and not isinstance(value, bool):
            if self.min_value is not None and value < self.min_value:
                issues.append(
                    self._issue(
                        "metadata.value_too_small",
                        "Metadata numeric value is below minimum",
                        min_value=self.min_value,
                        value=value,
                    )
                )
            if self.max_value is not None and value > self.max_value:
                issues.append(
                    self._issue(
                        "metadata.value_too_large",
                        "Metadata numeric value is above maximum",
                        max_value=self.max_value,
                        value=value,
                    )
                )
        if isinstance(value, (str, list, dict)):
            length = len(value)
            if self.min_length is not None and length < self.min_length:
                issues.append(
                    self._issue(
                        "metadata.length_too_short",
                        "Metadata value length is below minimum",
                        min_length=self.min_length,
                        length=length,
                    )
                )
            if self.max_length is not None and length > self.max_length:
                issues.append(
                    self._issue(
                        "metadata.length_too_long",
                        "Metadata value length is above maximum",
                        max_length=self.max_length,
                        length=length,
                    )
                )
        return issues

    def to_dict(self) -> dict[str, Any]:
        """Return the definition as a plain dict, passed through ``compact_dict``."""
        return compact_dict(
            {
                "key": self.key,
                "value_type": self.value_type.value,
                "required": self.required,
                "allow_multiple": self.allow_multiple,
                "allowed_values": list(self.allowed_values),
                "min_value": self.min_value,
                "max_value": self.max_value,
                "min_length": self.min_length,
                "max_length": self.max_length,
                "require_confirmed": self.require_confirmed,
                "description": self.description,
                "metadata": dict(self.metadata),
            }
        )

    @classmethod
    def from_dict(cls, data: dict[str, Any]) -> "MetadataFieldDefinition":
        """Build a definition from a plain dict.

        ``key`` and ``value_type`` are required; every other entry is
        optional. ``allowed_values`` and ``metadata`` may be absent or null.

        Raises:
            KeyError: If ``key`` or ``value_type`` is missing.
            ValueError: If ``value_type`` is not a known type name.
        """
        return cls(
            key=data["key"],
            value_type=MetadataValueType(data["value_type"]),
            required=bool(data.get("required", False)),
            allow_multiple=bool(data.get("allow_multiple", False)),
            # ``or`` also covers an explicit null, which dict.get's default
            # does not (the default applies only when the key is absent).
            allowed_values=tuple(data.get("allowed_values") or ()),
            min_value=data.get("min_value"),
            max_value=data.get("max_value"),
            min_length=data.get("min_length"),
            max_length=data.get("max_length"),
            require_confirmed=bool(data.get("require_confirmed", False)),
            description=data.get("description"),
            metadata=dict(data.get("metadata") or {}),
        )
@dataclass(frozen=True)
class MetadataSchema:
    """A named set of field definitions used to validate metadata records.

    ``asset_types`` optionally restricts which classifications the schema
    applies to; an empty tuple means it applies to every asset type.
    """

    schema_id: str
    name: str
    fields: tuple[MetadataFieldDefinition, ...]
    # When False, records whose key has no field definition are reported.
    allow_unknown: bool = True
    asset_types: tuple[str, ...] = ()
    version: str = "1"
    description: str | None = None
    metadata: dict[str, Any] = field(default_factory=dict)

    def __post_init__(self) -> None:
        """Normalize ``fields`` and ``asset_types`` to tuples."""
        # The dataclass is frozen, so normalization has to bypass __setattr__.
        object.__setattr__(self, "fields", tuple(self.fields))
        object.__setattr__(self, "asset_types", tuple(self.asset_types))

    def applies_to(self, classification: "Classification") -> bool:
        """Return True when this schema covers the classification's asset type."""
        return not self.asset_types or classification.asset_type in self.asset_types

    def validate(
        self,
        records: list["MetadataRecord"] | tuple["MetadataRecord", ...],
    ) -> list[MetadataValidationIssue]:
        """Validate records against every field definition.

        Returns:
            Issues from each field definition in declaration order, followed
            (when ``allow_unknown`` is False) by one issue per undeclared key
            in sorted key order. Empty when the records are valid.
        """
        # Group records by key, keeping the input order within each key.
        by_key: dict[str, list[MetadataRecord]] = {}
        for record in records:
            by_key.setdefault(record.key, []).append(record)

        issues: list[MetadataValidationIssue] = []
        for field_definition in self.fields:
            issues.extend(field_definition.validate(by_key.get(field_definition.key, [])))

        if not self.allow_unknown:
            known_keys = {field_definition.key for field_definition in self.fields}
            # Sorted so the reported order does not depend on set iteration.
            issues.extend(
                MetadataValidationIssue(
                    code="metadata.unknown_field",
                    key=key,
                    message="Metadata field is not allowed by schema",
                )
                for key in sorted(set(by_key) - known_keys)
            )
        return issues

    def validate_or_raise(
        self,
        records: list["MetadataRecord"] | tuple["MetadataRecord", ...],
    ) -> None:
        """Validate records and raise if any issue is found.

        Raises:
            ValidationError: With ``schema_id`` and the serialized issues in
                its ``details`` when validation reports at least one issue.
        """
        # Imported inside the method, as in the original code.
        from kontextual_engine.errors import ValidationError

        issues = self.validate(records)
        if issues:
            raise ValidationError(
                "Metadata schema validation failed",
                details={
                    "schema_id": self.schema_id,
                    "issues": [issue.to_dict() for issue in issues],
                },
            )

    def to_dict(self) -> dict[str, Any]:
        """Return the schema as a plain dict, passed through ``compact_dict``."""
        return compact_dict(
            {
                "schema_id": self.schema_id,
                "name": self.name,
                "fields": [field_definition.to_dict() for field_definition in self.fields],
                "allow_unknown": self.allow_unknown,
                "asset_types": list(self.asset_types),
                "version": self.version,
                "description": self.description,
                "metadata": dict(self.metadata),
            }
        )

    @classmethod
    def from_dict(cls, data: dict[str, Any]) -> "MetadataSchema":
        """Build a schema from a plain dict.

        ``schema_id`` and ``name`` are required. ``fields``, ``asset_types``,
        ``version`` and ``metadata`` may be absent or null; they then fall
        back to the same defaults as the constructor.

        Raises:
            KeyError: If ``schema_id`` or ``name`` is missing.
        """
        version = data.get("version")
        return cls(
            schema_id=data["schema_id"],
            name=data["name"],
            # ``or`` also covers an explicit null, which dict.get's default
            # does not (the default applies only when the key is absent).
            fields=tuple(
                MetadataFieldDefinition.from_dict(item) for item in data.get("fields") or ()
            ),
            allow_unknown=bool(data.get("allow_unknown", True)),
            asset_types=tuple(data.get("asset_types") or ()),
            # Without this guard a null version would become the string "None".
            version="1" if version is None else str(version),
            description=data.get("description"),
            metadata=dict(data.get("metadata") or {}),
        )
@dataclass(frozen=True)
class Classification:
asset_type: str
@@ -95,3 +360,19 @@ class MetadataRecord:
confirmed=bool(data.get("confirmed", False)),
created_at=data["created_at"],
)
def _matches_type(value: Any, value_type: MetadataValueType) -> bool:
    """Return True when ``value`` is a Python value of the declared schema type.

    ``bool`` is rejected for INTEGER and NUMBER even though it subclasses
    ``int``. An unrecognized ``value_type`` never matches.
    """
    # Each row: (schema type, accepted Python types, whether bool is rejected).
    rules = (
        (MetadataValueType.STRING, (str,), False),
        (MetadataValueType.INTEGER, (int,), True),
        (MetadataValueType.NUMBER, (int, float), True),
        (MetadataValueType.BOOLEAN, (bool,), False),
        (MetadataValueType.LIST, (list,), False),
        (MetadataValueType.OBJECT, (dict,), False),
    )
    for candidate, python_types, reject_bool in rules:
        # Compared with == rather than looked up in a dict, so the match
        # does not depend on hashing.
        if value_type == candidate:
            if reject_bool and isinstance(value, bool):
                return False
            return isinstance(value, python_types)
    return False

View File

@@ -17,6 +17,7 @@ from kontextual_engine.core import (
LifecycleState,
mapping_digest,
MetadataRecord,
MetadataSchema,
OperationContext,
PolicyDecision,
RelationshipTargetKind,
@@ -49,9 +50,11 @@ class AssetRegistryService:
repository: AssetRegistryRepository,
*,
policy_gateway: PolicyGateway | None = None,
metadata_schemas: list[MetadataSchema] | tuple[MetadataSchema, ...] | None = None,
) -> None:
self.repository = repository
self.policy_gateway = policy_gateway or AllowAllPolicyGateway()
self.metadata_schemas = tuple(metadata_schemas or ())
def create_asset(
self,
@@ -79,6 +82,7 @@ class AssetRegistryService:
existing = self._idempotent_lookup("asset.create", idempotency_key, request_hash)
if existing:
return self._asset_change_from_idempotency(existing)
self._validate_metadata_records(classification, list(metadata_records or []))
asset = KnowledgeAsset.create(
title,
@@ -145,6 +149,10 @@ class AssetRegistryService:
asset = self.repository.get_asset(asset_id)
decision = self._authorize(context, "asset.metadata.add", f"asset:{asset.id}")
next_sequence = self._next_sequence(asset.id)
self._validate_metadata_records(
asset.classification,
self.repository.list_metadata_records(asset.id) + [record],
)
self.repository.save_metadata_record(asset.id, record)
version = AssetVersion(
asset_id=asset.id,
@@ -404,6 +412,15 @@ class AssetRegistryService:
versions = self.repository.list_versions(asset_id)
return len(versions) + 1
def _validate_metadata_records(
    self,
    classification: Classification,
    records: list[MetadataRecord],
) -> None:
    """Run every applicable configured schema against the records.

    Schemas are tried in the order of ``self.metadata_schemas``; whatever the
    first failing schema's ``validate_or_raise`` raises propagates to the
    caller.
    """
    # Lazy filter: each schema's applies_to check runs just before its own
    # validation, not for all schemas up front.
    applicable = (
        candidate
        for candidate in self.metadata_schemas
        if candidate.applies_to(classification)
    )
    for candidate in applicable:
        candidate.validate_or_raise(records)
def _idempotent_lookup(
self,
operation: str,