Add guide-board pilot ingestion

This commit is contained in:
2026-05-17 00:09:11 +02:00
parent 1f379ba321
commit 91bb08c8e5
22 changed files with 1074 additions and 12 deletions

View File

@@ -52,6 +52,12 @@ class PackageCreate(BaseModel):
subject: str = Field(min_length=1)
retention_class: str = Field(min_length=1)
metadata: dict[str, Any] = Field(default_factory=dict)
metadata_schema_slug: str | None = None
# Request body accepted by POST /metadata-schemas.
class MetadataSchemaCreate(BaseModel):
    # Identifier the schema is registered under; must be non-empty.
    slug: str = Field(min_length=1)
    # The schema document itself, as a JSON object.
    json_schema: dict[str, Any]
class UploadCreate(BaseModel):
@@ -224,6 +230,24 @@ def create_app(settings: Settings | None = None) -> FastAPI:
classes = await registry.list_retention_classes()
return {"retention_classes": [_retention_class_dict(c) for c in classes]}
# Registers a metadata schema under ``body.slug`` and echoes back the stored
# record. The returned ``id`` is whatever the registry reports for the slug,
# while ``slug``/``json_schema``/``created_at`` are read back from the registry
# rather than copied from the request body.
@application.post("/metadata-schemas", status_code=status.HTTP_201_CREATED)
async def register_metadata_schema(
    body: MetadataSchemaCreate,
    _actor: str = Depends(require_write_auth),
    registry: Registry = Depends(get_registry),
) -> dict[str, Any]:
    slug = body.slug
    registered_id = await registry.register_metadata_schema(
        slug=slug,
        json_schema=body.json_schema,
    )
    # Second round-trip: fetch the row as stored so the response reflects it.
    stored = await registry.get_metadata_schema(slug)
    response: dict[str, Any] = {"id": str(registered_id)}
    response["slug"] = stored.slug
    response["json_schema"] = stored.json_schema
    response["created_at"] = _iso(stored.created_at)
    return response
@application.post("/packages", status_code=status.HTTP_201_CREATED)
async def create_package(
body: PackageCreate,
@@ -238,6 +262,7 @@ def create_app(settings: Settings | None = None) -> FastAPI:
retention_class=body.retention_class,
actor=actor,
metadata=body.metadata,
metadata_schema_slug=body.metadata_schema_slug,
)
return _package_dict(await registry.get_package(package_id))
except ValueError as exc:

View File

@@ -35,8 +35,10 @@ app = typer.Typer(
)
# One Typer sub-application per command group; each prints its help when
# invoked without arguments.
retention_app = typer.Typer(help="Retention lifecycle commands", no_args_is_help=True)
storage_app = typer.Typer(help="Storage backend commands", no_args_is_help=True)
guide_board_app = typer.Typer(help="Guide-board pilot commands", no_args_is_help=True)
# Mount the groups on the root app under their CLI names.
app.add_typer(retention_app, name="retention")
app.add_typer(storage_app, name="storage")
app.add_typer(guide_board_app, name="guide-board")
@app.callback()
@@ -208,6 +210,28 @@ def storage_verify(
)
# CLI entry point for `guide-board ingest`. Both paths are validated by the
# option parser so a missing run directory or schema file is reported as a
# usage error instead of an unhandled exception from the ingest coroutine.
# The docstring below is kept to one line because Typer shows it as the
# command's help text.
@guide_board_app.command("ingest")
def guide_board_ingest(
    run_dir: Path = typer.Argument(
        ...,
        exists=True,
        file_okay=False,
        dir_okay=True,
        readable=True,
        help="Guide-board run directory.",
    ),
    schema_path: Path = typer.Option(
        Path("schemas/guide-board.run.v1.json"),
        "--schema",
        # Same style of up-front validation as ``run_dir``: the schema must be
        # an existing, readable regular file.
        exists=True,
        file_okay=True,
        dir_okay=False,
        readable=True,
        help="Path to the guide-board metadata schema JSON.",
    ),
) -> None:
    """Ingest a guide-board run directory through the local registry."""
    settings = get_settings()
    result = asyncio.run(_guide_board_ingest_async(settings, run_dir, schema_path))
    # Emit the ingest summary as pretty-printed JSON on stdout.
    typer.echo(json.dumps(result, indent=2))
# ---- internals -------------------------------------------------------------
@@ -286,6 +310,34 @@ async def _storage_verify_async(
]
async def _guide_board_ingest_async(
    settings: Settings,
    run_dir: Path,
    schema_path: Path,
) -> dict[str, Any]:
    """Register the guide-board schema, ingest ``run_dir``, and summarise the result.

    A registry is built from ``settings`` and always disposed, whether or not
    the ingest succeeds. The schema file must decode to a JSON object;
    otherwise ``click.BadParameter`` is raised.

    Returns a dict with ``package_id``, ``manifest_digest``, ``file_count``
    and ``reused_existing`` copied from the ingest result.
    """
    from artifactstore.app import build_registry
    from artifactstore.pilots.guide_board import GUIDE_BOARD_SCHEMA_SLUG, ingest_run

    registry: Registry = build_registry(settings)
    try:
        schema_text = schema_path.read_text(encoding="utf-8")
        parsed_schema = json.loads(schema_text)
        if not isinstance(parsed_schema, dict):
            raise click.BadParameter(f"schema must be a JSON object: {schema_path}")
        await registry.register_metadata_schema(
            slug=GUIDE_BOARD_SCHEMA_SLUG,
            json_schema=parsed_schema,
        )
        outcome = await ingest_run(run_dir, registry=registry)
    finally:
        await registry.dispose()

    summary_fields = ("package_id", "manifest_digest", "file_count", "reused_existing")
    return {field: getattr(outcome, field) for field in summary_fields}
def _http_json(
method: str,
base_url: str,

View File

@@ -68,7 +68,9 @@ async def _apply_package_created(connection: AsyncConnection, event: Event) -> N
producer=payload["producer"],
subject=payload["subject"],
retention_class=payload["retention_class"],
metadata_schema_id=None,
metadata_schema_id=UUID(payload["metadata_schema_id"])
if payload.get("metadata_schema_id")
else None,
metadata=payload.get("metadata", {}),
status="created",
manifest_digest=None,

View File

@@ -0,0 +1 @@
"""Pilot producer integrations."""

View File

@@ -0,0 +1,308 @@
"""Guide-board pilot ingestion helper."""
from __future__ import annotations
import json
import mimetypes
import subprocess
from collections.abc import AsyncIterator
from dataclasses import dataclass
from pathlib import Path
from typing import Any
from artifactstore.registry import Registry
__all__ = ["GUIDE_BOARD_SCHEMA_SLUG", "GuideBoardIngestResult", "ingest_run"]
# Default metadata schema slug attached to packages created by ``ingest_run``.
GUIDE_BOARD_SCHEMA_SLUG = "guide-board.run.v1"
# Run-relative files that ``ingest_run`` adds to the package whenever they
# exist on disk, regardless of whether the assessment manifest declares them.
CORE_RUN_PATHS = (
    "run.json",
    "retention-summary.json",
    "plan.json",
    "sources.lock.json",
    "target-profile.snapshot.json",
    "assessment-profile.snapshot.json",
    "normalized/evidence.json",
    "normalized/findings.json",
    "normalized/mappings.json",
    "reports/fragments.json",
    "reports/submission-package.json",
    "exports/export-manifest.json",
)
@dataclass(frozen=True, slots=True)
class GuideBoardIngestResult:
    """Outcome of one ``ingest_run`` call."""

    # Package identifier rendered as a string.
    package_id: str
    # Manifest digest in the form ``blake3:<hex>``.
    manifest_digest: str
    # Number of files ingested by this call; 0 when an existing package was reused.
    file_count: int
    # True when an already-finalized package for the same run was returned.
    reused_existing: bool = False
async def ingest_run(
    run_dir: str | Path,
    *,
    registry: Registry,
    actor: str = "guide-board",
    metadata_schema_slug: str = GUIDE_BOARD_SCHEMA_SLUG,
) -> GuideBoardIngestResult:
    """Ingest one guide-board run directory into artifact-store.

    The run's ``run.json``, ``retention-summary.json`` and
    ``reports/assessment-package.json`` are required; ``sources.lock.json`` is
    optional. If the registry already holds a finalized guide-board package
    whose metadata ``run_id`` matches, that package is returned with
    ``reused_existing=True`` and nothing is written.

    Otherwise the file set is the union of the paths declared in the
    assessment package, the report paths retained in the retention summary,
    the assessment package itself, and every ``CORE_RUN_PATHS`` entry present
    on disk. The whole set is validated *before* the package is created, so a
    malformed manifest or a missing file no longer leaves a half-filled,
    never-finalized package behind.

    Args:
        run_dir: Root directory of the guide-board run.
        registry: Registry used for lookup, creation, ingest and finalization.
        actor: Actor recorded on every registry write.
        metadata_schema_slug: Schema slug the package metadata is checked against.

    Raises:
        ValueError: If a declared path is absolute or contains ``..``, or if
            the manifests are malformed.
        FileNotFoundError: If a declared file is not present under ``run_dir``.
        RuntimeError: If finalization leaves the package without a manifest digest.
    """
    root = Path(run_dir)
    run_json = _read_json(root / "run.json")
    retention_summary = _read_json(root / "retention-summary.json")
    source_lock = _read_json_if_exists(root / "sources.lock.json")
    package_manifest = _read_json(root / "reports" / "assessment-package.json")
    metadata = _metadata(run_json, retention_summary, source_lock)
    run_id = str(metadata["run_id"])

    existing = await registry.list_packages(
        producer="guide-board",
        metadata_key="run_id",
        metadata_value=run_id,
    )
    for package in existing:
        if package.status == "finalized" and package.manifest_digest_hex:
            return GuideBoardIngestResult(
                package_id=str(package.id),
                manifest_digest=f"blake3:{package.manifest_digest_hex}",
                file_count=0,
                reused_existing=True,
            )

    # Resolve and vet the complete file set up front, before any registry write.
    paths = _declared_paths(package_manifest)
    paths.update(_retained_report_paths(retention_summary))
    paths.add("reports/assessment-package.json")
    paths.update(rel_path for rel_path in CORE_RUN_PATHS if (root / rel_path).is_file())
    ordered_paths = sorted(paths)
    for rel_path in ordered_paths:
        declared = Path(rel_path)
        # Manifest content is producer-supplied: never follow it outside the run root.
        if declared.is_absolute() or ".." in declared.parts:
            raise ValueError(f"declared path escapes the run directory: {rel_path!r}")
        if not (root / declared).is_file():
            raise FileNotFoundError(
                f"declared file is missing from the run directory: {root / declared}"
            )

    package_id = await registry.create_package(
        name=f"guide-board run {run_id}",
        producer="guide-board",
        subject=str(metadata["target_profile_ref"]),
        retention_class=str(retention_summary.get("retention_class", "release-evidence")),
        actor=actor,
        metadata=metadata,
        metadata_schema_slug=metadata_schema_slug,
    )
    for rel_path in ordered_paths:
        source = root / rel_path
        await registry.ingest_file(
            package_id,
            relative_path=rel_path,
            media_type=mimetypes.guess_type(source.name)[0] or "application/octet-stream",
            stream=_file_chunks(source),
            actor=actor,
        )
    await registry.finalize_package(package_id, actor=actor)

    package = await registry.get_package(package_id)
    if package.manifest_digest_hex is None:
        raise RuntimeError(f"package {package_id} finalized without manifest digest")
    return GuideBoardIngestResult(
        package_id=str(package_id),
        manifest_digest=f"blake3:{package.manifest_digest_hex}",
        file_count=len(ordered_paths),
    )
def _metadata(
    run_json: dict[str, Any],
    retention_summary: dict[str, Any],
    source_lock: dict[str, Any] | None,
) -> dict[str, Any]:
    """Build the package metadata document for one guide-board run.

    Values are taken from ``run.json`` first and fall back to the retention
    summary where the code provides a fallback. ``result_status`` becomes
    ``"unknown"`` when no source records a status, rather than the literal
    string ``"None"`` that ``str(None)`` would produce.

    Raises:
        KeyError: If no run id can be found, or if ``run.json`` lacks
            ``target_profile_ref`` or ``assessment_profile_ref``.
    """
    summary = retention_summary.get("summary", {})
    if not isinstance(summary, dict):
        # A malformed summary is treated as empty instead of failing the ingest.
        summary = {}
    recorded_status = (
        run_json.get("result_status") or run_json.get("status") or summary.get("status")
    )
    return {
        "run_id": str(run_json.get("run_id") or run_json.get("id") or retention_summary["run_id"]),
        "target_profile_ref": str(run_json["target_profile_ref"]),
        "assessment_profile_ref": str(run_json["assessment_profile_ref"]),
        "result_status": str(recorded_status) if recorded_status else "unknown",
        "source_commits": _source_commits(run_json, source_lock),
        "report_paths": sorted(_retained_report_paths(retention_summary)),
        "evidence_counts": _evidence_counts(retention_summary, summary),
        "finding_counts": _finding_counts(retention_summary, summary),
    }
def _declared_paths(package_manifest: dict[str, Any]) -> set[str]:
    """Collect every file path declared by an assessment package manifest.

    ``files`` entries may be plain strings or objects with a string ``path``;
    ``artifact_manifest`` entries must be objects with a string ``path``.
    Either key may be absent or ``None``.

    Raises:
        ValueError: If either key holds a non-list, or an entry has an
            unsupported shape.
    """
    collected: set[str] = set()

    files = package_manifest.get("files", [])
    if not (files is None or isinstance(files, list)):
        raise ValueError("assessment-package.json 'files' must be a list")
    for item in files or ():
        if isinstance(item, str):
            collected.add(item)
            continue
        candidate = item.get("path") if isinstance(item, dict) else None
        if not isinstance(candidate, str):
            raise ValueError(f"invalid assessment package file entry: {item!r}")
        collected.add(candidate)

    artifacts = package_manifest.get("artifact_manifest", [])
    if not (artifacts is None or isinstance(artifacts, list)):
        raise ValueError("assessment-package.json 'artifact_manifest' must be a list")
    for item in artifacts or ():
        candidate = item.get("path") if isinstance(item, dict) else None
        if not isinstance(candidate, str):
            raise ValueError(f"invalid assessment package artifact entry: {item!r}")
        collected.add(candidate)

    return collected
def _retained_report_paths(retention_summary: dict[str, Any]) -> set[str]:
    """Return the non-empty string paths listed under the summary's report keys.

    Looks at ``report_paths``, ``report_refs`` and ``export_refs``; a key whose
    value is not a list is ignored, as are non-string or empty entries.
    """
    retained: set[str] = set()
    for field in ("report_paths", "report_refs", "export_refs"):
        listed = retention_summary.get(field, [])
        if isinstance(listed, list):
            for item in listed:
                if isinstance(item, str) and item:
                    retained.add(item)
    return retained
def _source_commits(
    run_json: dict[str, Any],
    source_lock: dict[str, Any] | None,
) -> dict[str, str]:
    """Work out which source revisions a run was produced from.

    Preference order: the ``source_commits`` mapping recorded in ``run.json``;
    git HEADs of the locations named in the source lock; checksums recorded in
    the source lock; and finally a fixed placeholder entry.
    """
    recorded = run_json.get("source_commits")
    if isinstance(recorded, dict):
        return {str(label): str(commit) for label, commit in recorded.items()}

    if source_lock is not None:
        heads = {
            label: head
            for label, location in _source_paths(source_lock).items()
            if (head := _git_head(location)) is not None
        }
        if heads:
            return heads
        checksums = _source_fingerprints(source_lock)
        if checksums:
            return checksums

    return {"unknown": "unrecorded-source"}
def _source_paths(source_lock: dict[str, Any]) -> dict[str, Path]:
    """Map ``profile:<key>`` / ``extension:<id>`` labels to filesystem paths.

    Profile entries contribute any string ``path``; extension entries only
    contribute when their ``path`` is an absolute path string. Extensions
    without a usable id are labelled ``unknown-extension``.
    """
    located: dict[str, Path] = {}

    profiles = source_lock.get("profiles", {})
    if isinstance(profiles, dict):
        for name, spec in profiles.items():
            if not isinstance(spec, dict):
                continue
            location = spec.get("path")
            if isinstance(location, str):
                located[f"profile:{name}"] = Path(location)

    extensions = source_lock.get("extensions", [])
    if isinstance(extensions, list):
        for spec in extensions:
            if not isinstance(spec, dict):
                continue
            extension_id = str(spec.get("id") or "unknown-extension")
            location = spec.get("path")
            if not isinstance(location, str):
                continue
            if Path(location).is_absolute():
                located[f"extension:{extension_id}"] = Path(location)

    return located
def _git_head(path: Path) -> str | None:
try:
completed = subprocess.run(
["git", "-C", str(path.parent if path.is_file() else path), "rev-parse", "HEAD"],
check=True,
capture_output=True,
text=True,
timeout=5,
)
except (OSError, subprocess.CalledProcessError, subprocess.TimeoutExpired):
return None
commit = completed.stdout.strip()
return commit or None
def _source_fingerprints(source_lock: dict[str, Any]) -> dict[str, str]:
    """Collect the checksums recorded in a source lock, keyed by label.

    Produces ``source_lock`` from a string top-level ``id``,
    ``profile:<key>`` from each profile's string ``checksum``, and
    ``extension:<id>`` from each extension's string ``manifest_checksum``.

    Extensions whose ``id`` is missing, ``None`` or empty are labelled
    ``extension:unknown-extension``, the same rule ``_source_paths`` applies,
    so the two helpers never disagree on a label. Several such extensions
    share that label and the last one wins.
    """
    fingerprints: dict[str, str] = {}

    lock_id = source_lock.get("id")
    if isinstance(lock_id, str):
        fingerprints["source_lock"] = lock_id

    profiles = source_lock.get("profiles", {})
    if isinstance(profiles, dict):
        for key, value in profiles.items():
            if isinstance(value, dict) and isinstance(value.get("checksum"), str):
                fingerprints[f"profile:{key}"] = value["checksum"]

    extensions = source_lock.get("extensions", [])
    if isinstance(extensions, list):
        for entry in extensions:
            if isinstance(entry, dict) and isinstance(entry.get("manifest_checksum"), str):
                # ``or`` (not a .get default) so a null/empty id also falls back.
                extension_id = str(entry.get("id") or "unknown-extension")
                fingerprints[f"extension:{extension_id}"] = entry["manifest_checksum"]

    return fingerprints
def _evidence_counts(
    retention_summary: dict[str, Any],
    summary: dict[str, Any],
) -> dict[str, int]:
    """Return evidence counters, preferring the retention summary's own mapping.

    Uses ``retention_summary["evidence_counts"]`` when it is a dict, otherwise
    ``summary["evidence_results"]`` when that is a dict, otherwise ``{}``.
    """
    candidates = (
        retention_summary.get("evidence_counts"),
        summary.get("evidence_results"),
    )
    for candidate in candidates:
        if isinstance(candidate, dict):
            return _int_mapping(candidate)
    return {}
def _finding_counts(
    retention_summary: dict[str, Any],
    summary: dict[str, Any],
) -> dict[str, int]:
    """Return finding counters for a run.

    Uses ``retention_summary["finding_counts"]`` when it is a dict; otherwise
    picks the known counter keys that are present in ``summary``. Either way
    only integer values survive.
    """
    explicit = retention_summary.get("finding_counts")
    if isinstance(explicit, dict):
        return _int_mapping(explicit)

    counter_names = (
        "finding_count",
        "unexpected_findings",
        "expected_findings",
        "waived_findings",
        "challenged_findings",
        "authority_exclusions",
        "unresolved_defects",
        "unresolved_review_items",
    )
    present: dict[str, Any] = {}
    for name in counter_names:
        if name in summary:
            present[name] = summary[name]
    return _int_mapping(present)
def _int_mapping(raw: dict[str, Any]) -> dict[str, int]:
    """Keep only the integer-valued entries of ``raw``, with string keys.

    Booleans are excluded even though ``bool`` is a subclass of ``int``.
    """
    filtered: dict[str, int] = {}
    for name, count in raw.items():
        if isinstance(count, bool) or not isinstance(count, int):
            continue
        filtered[str(name)] = int(count)
    return filtered
def _read_json(path: Path) -> dict[str, Any]:
    """Load ``path`` as UTF-8 JSON and require the top-level value to be an object.

    Raises:
        ValueError: If the document is not a JSON object.
    """
    document = json.loads(path.read_text(encoding="utf-8"))
    if isinstance(document, dict):
        return document
    raise ValueError(f"{path} must contain a JSON object")
def _read_json_if_exists(path: Path) -> dict[str, Any] | None:
    """Return the JSON object stored at ``path``, or ``None`` when nothing exists there."""
    return _read_json(path) if path.exists() else None
async def _file_chunks(path: Path, chunk_size: int = 64 * 1024) -> AsyncIterator[bytes]:
    """Yield the bytes of ``path`` in pieces of at most ``chunk_size`` bytes.

    The file is read with ordinary blocking reads; an empty file yields nothing.
    """
    with path.open("rb") as handle:
        while piece := handle.read(chunk_size):
            yield piece

View File

@@ -33,6 +33,7 @@ from artifactstore.dataplane.spi import DataPlane, IngestHints
from artifactstore.db.schema import (
artifact_files,
artifact_packages,
metadata_schemas,
retention_classes,
retention_state,
storage_locations,
@@ -70,6 +71,7 @@ __all__ = [
"FileNotFoundError",
"FileRecord",
"IllegalPackageStateError",
"MetadataSchemaRecord",
"PackageNotFoundError",
"PackageRecord",
"Registry",
@@ -100,6 +102,16 @@ class RetentionStateError(ValueError):
"""Raised when a retention lifecycle operation is invalid."""
@dataclass(frozen=True, slots=True)
class MetadataSchemaRecord:
    """Registered package metadata schema."""

    # Primary key of the schema row.
    id: UUID
    # Slug the schema is looked up by.
    slug: str
    # The stored schema document.
    json_schema: dict[str, Any]
    # Row creation timestamp; typed as optional, so callers must handle None.
    created_at: datetime | None
@dataclass(frozen=True, slots=True)
class PackageRecord:
"""Materialised package row projected into the registry API."""
@@ -208,9 +220,15 @@ class Registry:
retention_class: str,
actor: str,
metadata: dict[str, Any] | None = None,
metadata_schema_slug: str | None = None,
) -> UUID:
"""Create a new package; returns its ``UUID``."""
retention_class_row = await self._get_retention_class(retention_class)
package_metadata = metadata or {}
metadata_schema_id = await self._validate_metadata_schema(
metadata_schema_slug,
package_metadata,
)
package_id = uuid.uuid4()
payload = cbor2.dumps(
{
@@ -218,7 +236,8 @@ class Registry:
"producer": producer,
"subject": subject,
"retention_class": retention_class,
"metadata": metadata or {},
"metadata": package_metadata,
"metadata_schema_id": str(metadata_schema_id) if metadata_schema_id else None,
},
canonical=True,
)
@@ -513,6 +532,48 @@ class Registry:
for r in rows
]
async def register_metadata_schema(
    self,
    *,
    slug: str,
    json_schema: dict[str, Any],
) -> UUID:
    """Register a package metadata JSON Schema, idempotent by slug.

    If a row with ``slug`` already exists its id is returned and the supplied
    ``json_schema`` is not written; otherwise a new row is inserted and its
    freshly generated id is returned. Lookup and insert share one transaction.
    """
    lookup = select(metadata_schemas.c.id).where(metadata_schemas.c.slug == slug)
    async with self._engine.begin() as conn:
        found = (await conn.execute(lookup)).first()
        if found is not None:
            return UUID(str(found.id))
        new_id = uuid.uuid4()
        insert_stmt = metadata_schemas.insert().values(
            id=new_id,
            slug=slug,
            json_schema=json_schema,
        )
        await conn.execute(insert_stmt)
    return new_id
async def get_metadata_schema(self, slug: str) -> MetadataSchemaRecord:
    """Return one registered metadata schema by slug.

    Raises:
        KeyError: If no schema is registered under ``slug``.
    """
    query = select(metadata_schemas).where(metadata_schemas.c.slug == slug)
    async with self._engine.connect() as conn:
        result = await conn.execute(query)
        found = result.first()
    if found is None:
        raise KeyError(f"metadata schema not found: {slug}")
    # Copy the stored document into a plain dict for the record.
    document = dict(found.json_schema)
    return MetadataSchemaRecord(
        id=found.id,
        slug=found.slug,
        json_schema=document,
        created_at=found.created_at,
    )
async def get_retention_state(self, package_id: UUID) -> RetentionStateRecord:
"""Return the retention materialised view for one package."""
async with self._engine.connect() as conn:
@@ -902,6 +963,25 @@ class Registry:
deletion_strategy=row.deletion_strategy,
)
async def _validate_metadata_schema(
self,
slug: str | None,
metadata: dict[str, Any],
) -> UUID | None:
if slug is None:
return None
try:
schema = await self.get_metadata_schema(slug)
except KeyError as exc:
raise ValueError(str(exc)) from exc
required = schema.json_schema.get("required", [])
if not isinstance(required, list):
raise ValueError(f"metadata schema {slug!r} has invalid required list")
missing = [key for key in required if isinstance(key, str) and key not in metadata]
if missing:
raise ValueError(f"metadata missing required schema keys: {', '.join(missing)}")
return schema.id
def _iso(value: datetime | None) -> str | None:
if value is None: