generated from coulomb/repo-seed
Add guide-board pilot ingestion
This commit is contained in:
@@ -52,6 +52,12 @@ class PackageCreate(BaseModel):
|
||||
subject: str = Field(min_length=1)
|
||||
retention_class: str = Field(min_length=1)
|
||||
metadata: dict[str, Any] = Field(default_factory=dict)
|
||||
metadata_schema_slug: str | None = None
|
||||
|
||||
|
||||
# Request body accepted by the metadata-schema registration endpoint.
# (Kept as comments rather than a docstring so the generated API schema
# description stays exactly as it was.)
class MetadataSchemaCreate(BaseModel):
    # Non-empty identifier the schema is registered under.
    slug: str = Field(min_length=1)

    # The schema document itself, as a JSON object.
    json_schema: dict[str, Any]
|
||||
|
||||
|
||||
class UploadCreate(BaseModel):
|
||||
@@ -224,6 +230,24 @@ def create_app(settings: Settings | None = None) -> FastAPI:
|
||||
classes = await registry.list_retention_classes()
|
||||
return {"retention_classes": [_retention_class_dict(c) for c in classes]}
|
||||
|
||||
@application.post("/metadata-schemas", status_code=status.HTTP_201_CREATED)
async def register_metadata_schema(
    body: MetadataSchemaCreate,
    _actor: str = Depends(require_write_auth),
    registry: Registry = Depends(get_registry),
) -> dict[str, Any]:
    # Registers the posted schema under its slug, then reads the stored row
    # back so the response reflects what the registry actually holds.
    # `_actor` is unused beyond enforcing the write-auth dependency.
    registered_id = await registry.register_metadata_schema(
        slug=body.slug,
        json_schema=body.json_schema,
    )
    stored = await registry.get_metadata_schema(body.slug)

    response: dict[str, Any] = {"id": str(registered_id)}
    response["slug"] = stored.slug
    response["json_schema"] = stored.json_schema
    response["created_at"] = _iso(stored.created_at)
    return response
|
||||
|
||||
@application.post("/packages", status_code=status.HTTP_201_CREATED)
|
||||
async def create_package(
|
||||
body: PackageCreate,
|
||||
@@ -238,6 +262,7 @@ def create_app(settings: Settings | None = None) -> FastAPI:
|
||||
retention_class=body.retention_class,
|
||||
actor=actor,
|
||||
metadata=body.metadata,
|
||||
metadata_schema_slug=body.metadata_schema_slug,
|
||||
)
|
||||
return _package_dict(await registry.get_package(package_id))
|
||||
except ValueError as exc:
|
||||
|
||||
@@ -35,8 +35,10 @@ app = typer.Typer(
|
||||
)
|
||||
retention_app = typer.Typer(help="Retention lifecycle commands", no_args_is_help=True)
|
||||
storage_app = typer.Typer(help="Storage backend commands", no_args_is_help=True)
|
||||
guide_board_app = typer.Typer(help="Guide-board pilot commands", no_args_is_help=True)
|
||||
app.add_typer(retention_app, name="retention")
|
||||
app.add_typer(storage_app, name="storage")
|
||||
app.add_typer(guide_board_app, name="guide-board")
|
||||
|
||||
|
||||
@app.callback()
|
||||
@@ -208,6 +210,28 @@ def storage_verify(
|
||||
)
|
||||
|
||||
|
||||
@guide_board_app.command("ingest")
def guide_board_ingest(
    run_dir: Path = typer.Argument(
        ...,
        exists=True,
        file_okay=False,
        dir_okay=True,
        readable=True,
        help="Guide-board run directory.",
    ),
    schema_path: Path = typer.Option(
        Path("schemas/guide-board.run.v1.json"),
        "--schema",
        # Validate the schema file the same way run_dir is validated, so a
        # missing or unreadable file is reported as a usage error instead of
        # an unhandled exception from inside the async helper.
        exists=True,
        file_okay=True,
        dir_okay=False,
        readable=True,
        help="Path to the guide-board metadata schema JSON.",
    ),
) -> None:
    """Ingest a guide-board run directory through the local registry."""
    # The docstring above doubles as the command's help text, so further
    # notes live in comments.
    #
    # Runs the async ingestion helper to completion and prints its result
    # dictionary as indented JSON on stdout.
    settings = get_settings()
    outcome = asyncio.run(_guide_board_ingest_async(settings, run_dir, schema_path))
    typer.echo(json.dumps(outcome, indent=2))
|
||||
|
||||
|
||||
# ---- internals -------------------------------------------------------------
|
||||
|
||||
|
||||
@@ -286,6 +310,34 @@ async def _storage_verify_async(
|
||||
]
|
||||
|
||||
|
||||
async def _guide_board_ingest_async(
    settings: Settings,
    run_dir: Path,
    schema_path: Path,
) -> dict[str, Any]:
    """Register the guide-board schema and ingest one run directory.

    Args:
        settings: Application settings used to build the registry.
        run_dir: Guide-board run directory to ingest.
        schema_path: JSON file holding the metadata schema to register.

    Returns:
        A dictionary with ``package_id``, ``manifest_digest``, ``file_count``
        and ``reused_existing`` taken from the ingestion result.

    Raises:
        click.BadParameter: If the schema file cannot be read, is not valid
            JSON, or does not contain a JSON object.
    """
    from artifactstore.app import build_registry
    from artifactstore.pilots.guide_board import GUIDE_BOARD_SCHEMA_SLUG, ingest_run

    # Load and check the schema before building the registry: a bad schema
    # file then fails fast with a CLI-style error and nothing needs disposal.
    try:
        schema = json.loads(schema_path.read_text(encoding="utf-8"))
    except (OSError, ValueError) as exc:
        # ValueError covers both JSON syntax errors and undecodable bytes.
        raise click.BadParameter(f"cannot load schema {schema_path}: {exc}") from exc
    if not isinstance(schema, dict):
        raise click.BadParameter(f"schema must be a JSON object: {schema_path}")

    registry: Registry = build_registry(settings)
    try:
        await registry.register_metadata_schema(
            slug=GUIDE_BOARD_SCHEMA_SLUG,
            json_schema=schema,
        )
        result = await ingest_run(run_dir, registry=registry)
    finally:
        # Always release the registry, even when ingestion fails.
        await registry.dispose()

    return {
        "package_id": result.package_id,
        "manifest_digest": result.manifest_digest,
        "file_count": result.file_count,
        "reused_existing": result.reused_existing,
    }
|
||||
|
||||
|
||||
def _http_json(
|
||||
method: str,
|
||||
base_url: str,
|
||||
|
||||
@@ -68,7 +68,9 @@ async def _apply_package_created(connection: AsyncConnection, event: Event) -> N
|
||||
producer=payload["producer"],
|
||||
subject=payload["subject"],
|
||||
retention_class=payload["retention_class"],
|
||||
metadata_schema_id=None,
|
||||
metadata_schema_id=UUID(payload["metadata_schema_id"])
|
||||
if payload.get("metadata_schema_id")
|
||||
else None,
|
||||
metadata=payload.get("metadata", {}),
|
||||
status="created",
|
||||
manifest_digest=None,
|
||||
|
||||
1
src/artifactstore/pilots/__init__.py
Normal file
1
src/artifactstore/pilots/__init__.py
Normal file
@@ -0,0 +1 @@
|
||||
"""Pilot producer integrations."""
|
||||
308
src/artifactstore/pilots/guide_board.py
Normal file
308
src/artifactstore/pilots/guide_board.py
Normal file
@@ -0,0 +1,308 @@
|
||||
"""Guide-board pilot ingestion helper."""
|
||||
|
||||
from __future__ import annotations
|
||||
|
||||
import json
|
||||
import mimetypes
|
||||
import subprocess
|
||||
from collections.abc import AsyncIterator
|
||||
from dataclasses import dataclass
|
||||
from pathlib import Path
|
||||
from typing import Any
|
||||
|
||||
from artifactstore.registry import Registry
|
||||
|
||||
__all__ = ["GUIDE_BOARD_SCHEMA_SLUG", "GuideBoardIngestResult", "ingest_run"]
|
||||
|
||||
# Default metadata schema slug attached to packages created by ingest_run.
GUIDE_BOARD_SCHEMA_SLUG = "guide-board.run.v1"

# Well-known files of a run directory, relative to its root. ingest_run adds
# each one to the package only when it is present on disk.
CORE_RUN_PATHS = (
    # Run-level documents.
    "run.json",
    "retention-summary.json",
    "plan.json",
    "sources.lock.json",
    # Profile snapshots.
    "target-profile.snapshot.json",
    "assessment-profile.snapshot.json",
    # Normalized outputs.
    "normalized/evidence.json",
    "normalized/findings.json",
    "normalized/mappings.json",
    # Reports and exports.
    "reports/fragments.json",
    "reports/submission-package.json",
    "exports/export-manifest.json",
)
|
||||
|
||||
|
||||
@dataclass(frozen=True, slots=True)
|
||||
class GuideBoardIngestResult:
|
||||
package_id: str
|
||||
manifest_digest: str
|
||||
file_count: int
|
||||
reused_existing: bool = False
|
||||
|
||||
|
||||
async def ingest_run(
    run_dir: str | Path,
    *,
    registry: Registry,
    actor: str = "guide-board",
    metadata_schema_slug: str = GUIDE_BOARD_SCHEMA_SLUG,
) -> GuideBoardIngestResult:
    """Ingest one guide-board run directory into artifact-store.

    Reads ``run.json``, ``retention-summary.json``, the optional
    ``sources.lock.json`` and ``reports/assessment-package.json`` from the run
    directory. If the registry already holds a finalized package for the same
    ``run_id`` that package is returned; otherwise a new package is created,
    every selected file is ingested and the package is finalized.

    Args:
        run_dir: Root of the guide-board run directory.
        registry: Registry used to look up, create, fill and finalize packages.
        actor: Actor name recorded on the registry operations.
        metadata_schema_slug: Slug of the metadata schema the package
            metadata is validated against.

    Returns:
        The package id, manifest digest and file count of the ingestion.

    Raises:
        ValueError: If a run document is not a JSON object, the assessment
            package manifest is malformed, or a declared path is absolute or
            contains ``..`` (would leave the run directory).
        FileNotFoundError: If a selected file does not exist in the run
            directory.
        RuntimeError: If the package is finalized without a manifest digest.
    """
    root = Path(run_dir)
    run_json = _read_json(root / "run.json")
    retention_summary = _read_json(root / "retention-summary.json")
    source_lock = _read_json_if_exists(root / "sources.lock.json")
    package_manifest = _read_json(root / "reports" / "assessment-package.json")

    metadata = _metadata(run_json, retention_summary, source_lock)
    run_id = str(metadata["run_id"])

    # Re-running on an already ingested run must not create a duplicate.
    existing = await registry.list_packages(
        producer="guide-board",
        metadata_key="run_id",
        metadata_value=run_id,
    )
    for package in existing:
        if package.status == "finalized" and package.manifest_digest_hex:
            return GuideBoardIngestResult(
                package_id=str(package.id),
                manifest_digest=f"blake3:{package.manifest_digest_hex}",
                file_count=0,
                reused_existing=True,
            )

    # Work out and validate the complete file set BEFORE creating the package,
    # so a bad manifest or a missing file cannot leave an unfinalized package
    # behind in the registry.
    paths = _declared_paths(package_manifest)
    paths.update(_retained_report_paths(retention_summary))
    paths.add("reports/assessment-package.json")
    paths.update(rel for rel in CORE_RUN_PATHS if (root / rel).is_file())
    ordered_paths = sorted(paths)
    for rel_path in ordered_paths:
        candidate = Path(rel_path)
        # Paths come from files inside the run directory; refuse anything that
        # would make `root / rel_path` point outside of it.
        if candidate.is_absolute() or ".." in candidate.parts:
            raise ValueError(f"run file path escapes run directory: {rel_path!r}")
        if not (root / candidate).is_file():
            raise FileNotFoundError(f"declared run file not found: {root / candidate}")

    package_id = await registry.create_package(
        name=f"guide-board run {run_id}",
        producer="guide-board",
        subject=str(metadata["target_profile_ref"]),
        retention_class=str(retention_summary.get("retention_class", "release-evidence")),
        actor=actor,
        metadata=metadata,
        metadata_schema_slug=metadata_schema_slug,
    )

    # Sorted order keeps the ingestion sequence deterministic.
    for rel_path in ordered_paths:
        source = root / rel_path
        await registry.ingest_file(
            package_id,
            relative_path=rel_path,
            media_type=mimetypes.guess_type(source.name)[0] or "application/octet-stream",
            stream=_file_chunks(source),
            actor=actor,
        )

    await registry.finalize_package(package_id, actor=actor)
    package = await registry.get_package(package_id)
    if package.manifest_digest_hex is None:
        raise RuntimeError(f"package {package_id} finalized without manifest digest")
    return GuideBoardIngestResult(
        package_id=str(package_id),
        manifest_digest=f"blake3:{package.manifest_digest_hex}",
        file_count=len(ordered_paths),
    )
|
||||
|
||||
|
||||
def _metadata(
    run_json: dict[str, Any],
    retention_summary: dict[str, Any],
    source_lock: dict[str, Any] | None,
) -> dict[str, Any]:
    """Build the package metadata dictionary for one run.

    Raises:
        KeyError: If ``run_json`` lacks ``target_profile_ref`` or
            ``assessment_profile_ref``, or no run id is found in either
            document.
    """
    nested = retention_summary.get("summary", {})
    summary = nested if isinstance(nested, dict) else {}

    # First truthy candidate wins; the retention summary is the last resort
    # and is required when run.json carries no id.
    run_id = run_json.get("run_id") or run_json.get("id") or retention_summary["run_id"]
    target_ref = run_json["target_profile_ref"]
    assessment_ref = run_json["assessment_profile_ref"]
    # NOTE(review): when no status is present anywhere this becomes the
    # literal string "None" - confirm that is acceptable to the schema.
    status_value = (
        run_json.get("result_status") or run_json.get("status") or summary.get("status")
    )

    return {
        "run_id": str(run_id),
        "target_profile_ref": str(target_ref),
        "assessment_profile_ref": str(assessment_ref),
        "result_status": str(status_value),
        "source_commits": _source_commits(run_json, source_lock),
        "report_paths": sorted(_retained_report_paths(retention_summary)),
        "evidence_counts": _evidence_counts(retention_summary, summary),
        "finding_counts": _finding_counts(retention_summary, summary),
    }
|
||||
|
||||
|
||||
def _declared_paths(package_manifest: dict[str, Any]) -> set[str]:
|
||||
paths: set[str] = set()
|
||||
raw_files = package_manifest.get("files", [])
|
||||
if raw_files is not None and not isinstance(raw_files, list):
|
||||
raise ValueError("assessment-package.json 'files' must be a list")
|
||||
for entry in raw_files or []:
|
||||
if isinstance(entry, str):
|
||||
paths.add(entry)
|
||||
elif isinstance(entry, dict) and isinstance(entry.get("path"), str):
|
||||
paths.add(entry["path"])
|
||||
else:
|
||||
raise ValueError(f"invalid assessment package file entry: {entry!r}")
|
||||
|
||||
raw_artifacts = package_manifest.get("artifact_manifest", [])
|
||||
if raw_artifacts is not None and not isinstance(raw_artifacts, list):
|
||||
raise ValueError("assessment-package.json 'artifact_manifest' must be a list")
|
||||
for entry in raw_artifacts or []:
|
||||
if isinstance(entry, dict) and isinstance(entry.get("path"), str):
|
||||
paths.add(entry["path"])
|
||||
else:
|
||||
raise ValueError(f"invalid assessment package artifact entry: {entry!r}")
|
||||
return paths
|
||||
|
||||
|
||||
def _retained_report_paths(retention_summary: dict[str, Any]) -> set[str]:
|
||||
paths: set[str] = set()
|
||||
for key in ("report_paths", "report_refs", "export_refs"):
|
||||
raw_paths = retention_summary.get(key, [])
|
||||
if not isinstance(raw_paths, list):
|
||||
continue
|
||||
paths.update(path for path in raw_paths if isinstance(path, str) and path)
|
||||
return paths
|
||||
|
||||
|
||||
def _source_commits(
|
||||
run_json: dict[str, Any],
|
||||
source_lock: dict[str, Any] | None,
|
||||
) -> dict[str, str]:
|
||||
raw = run_json.get("source_commits")
|
||||
if isinstance(raw, dict):
|
||||
return {str(key): str(value) for key, value in raw.items()}
|
||||
|
||||
commits: dict[str, str] = {}
|
||||
if source_lock is not None:
|
||||
for label, path in _source_paths(source_lock).items():
|
||||
commit = _git_head(path)
|
||||
if commit is not None:
|
||||
commits[label] = commit
|
||||
if commits:
|
||||
return commits
|
||||
|
||||
fingerprints = _source_fingerprints(source_lock)
|
||||
if fingerprints:
|
||||
return fingerprints
|
||||
|
||||
return {"unknown": "unrecorded-source"}
|
||||
|
||||
|
||||
def _source_paths(source_lock: dict[str, Any]) -> dict[str, Path]:
|
||||
paths: dict[str, Path] = {}
|
||||
profiles = source_lock.get("profiles", {})
|
||||
if isinstance(profiles, dict):
|
||||
for key, value in profiles.items():
|
||||
if isinstance(value, dict) and isinstance(value.get("path"), str):
|
||||
paths[f"profile:{key}"] = Path(value["path"])
|
||||
|
||||
extensions = source_lock.get("extensions", [])
|
||||
if isinstance(extensions, list):
|
||||
for entry in extensions:
|
||||
if not isinstance(entry, dict):
|
||||
continue
|
||||
extension_id = str(entry.get("id") or "unknown-extension")
|
||||
raw_path = entry.get("path")
|
||||
if isinstance(raw_path, str) and Path(raw_path).is_absolute():
|
||||
paths[f"extension:{extension_id}"] = Path(raw_path)
|
||||
return paths
|
||||
|
||||
|
||||
def _git_head(path: Path) -> str | None:
|
||||
try:
|
||||
completed = subprocess.run(
|
||||
["git", "-C", str(path.parent if path.is_file() else path), "rev-parse", "HEAD"],
|
||||
check=True,
|
||||
capture_output=True,
|
||||
text=True,
|
||||
timeout=5,
|
||||
)
|
||||
except (OSError, subprocess.CalledProcessError, subprocess.TimeoutExpired):
|
||||
return None
|
||||
commit = completed.stdout.strip()
|
||||
return commit or None
|
||||
|
||||
|
||||
def _source_fingerprints(source_lock: dict[str, Any]) -> dict[str, str]:
|
||||
fingerprints: dict[str, str] = {}
|
||||
for key, value in source_lock.items():
|
||||
if key == "id" and isinstance(value, str):
|
||||
fingerprints["source_lock"] = value
|
||||
|
||||
profiles = source_lock.get("profiles", {})
|
||||
if isinstance(profiles, dict):
|
||||
for key, value in profiles.items():
|
||||
if isinstance(value, dict) and isinstance(value.get("checksum"), str):
|
||||
fingerprints[f"profile:{key}"] = value["checksum"]
|
||||
|
||||
extensions = source_lock.get("extensions", [])
|
||||
if isinstance(extensions, list):
|
||||
for entry in extensions:
|
||||
if isinstance(entry, dict) and isinstance(entry.get("manifest_checksum"), str):
|
||||
fingerprints[f"extension:{entry.get('id', 'unknown-extension')}"] = entry[
|
||||
"manifest_checksum"
|
||||
]
|
||||
return fingerprints
|
||||
|
||||
|
||||
def _evidence_counts(
|
||||
retention_summary: dict[str, Any],
|
||||
summary: dict[str, Any],
|
||||
) -> dict[str, int]:
|
||||
raw = retention_summary.get("evidence_counts")
|
||||
if isinstance(raw, dict):
|
||||
return _int_mapping(raw)
|
||||
raw_evidence = summary.get("evidence_results")
|
||||
if isinstance(raw_evidence, dict):
|
||||
return _int_mapping(raw_evidence)
|
||||
return {}
|
||||
|
||||
|
||||
def _finding_counts(
    retention_summary: dict[str, Any],
    summary: dict[str, Any],
) -> dict[str, int]:
    """Return finding counts for a run.

    Uses ``retention_summary["finding_counts"]`` when it is a dict; otherwise
    picks the known finding counters that are present in ``summary``.
    """
    explicit = retention_summary.get("finding_counts")
    if isinstance(explicit, dict):
        return _int_mapping(explicit)

    counter_names = (
        "finding_count",
        "unexpected_findings",
        "expected_findings",
        "waived_findings",
        "challenged_findings",
        "authority_exclusions",
        "unresolved_defects",
        "unresolved_review_items",
    )
    present: dict[str, Any] = {}
    for name in counter_names:
        if name in summary:
            present[name] = summary[name]
    return _int_mapping(present)
|
||||
|
||||
|
||||
def _int_mapping(raw: dict[str, Any]) -> dict[str, int]:
|
||||
return {
|
||||
str(key): int(value)
|
||||
for key, value in raw.items()
|
||||
if isinstance(value, int) and not isinstance(value, bool)
|
||||
}
|
||||
|
||||
|
||||
def _read_json(path: Path) -> dict[str, Any]:
|
||||
with path.open("r", encoding="utf-8") as fh:
|
||||
payload = json.load(fh)
|
||||
if not isinstance(payload, dict):
|
||||
raise ValueError(f"{path} must contain a JSON object")
|
||||
return payload
|
||||
|
||||
|
||||
def _read_json_if_exists(path: Path) -> dict[str, Any] | None:
|
||||
if not path.exists():
|
||||
return None
|
||||
return _read_json(path)
|
||||
|
||||
|
||||
async def _file_chunks(path: Path, chunk_size: int = 64 * 1024) -> AsyncIterator[bytes]:
|
||||
with path.open("rb") as fh:
|
||||
while True:
|
||||
chunk = fh.read(chunk_size)
|
||||
if not chunk:
|
||||
break
|
||||
yield chunk
|
||||
@@ -33,6 +33,7 @@ from artifactstore.dataplane.spi import DataPlane, IngestHints
|
||||
from artifactstore.db.schema import (
|
||||
artifact_files,
|
||||
artifact_packages,
|
||||
metadata_schemas,
|
||||
retention_classes,
|
||||
retention_state,
|
||||
storage_locations,
|
||||
@@ -70,6 +71,7 @@ __all__ = [
|
||||
"FileNotFoundError",
|
||||
"FileRecord",
|
||||
"IllegalPackageStateError",
|
||||
"MetadataSchemaRecord",
|
||||
"PackageNotFoundError",
|
||||
"PackageRecord",
|
||||
"Registry",
|
||||
@@ -100,6 +102,16 @@ class RetentionStateError(ValueError):
|
||||
"""Raised when a retention lifecycle operation is invalid."""
|
||||
|
||||
|
||||
@dataclass(frozen=True, slots=True)
|
||||
class MetadataSchemaRecord:
|
||||
"""Registered package metadata schema."""
|
||||
|
||||
id: UUID
|
||||
slug: str
|
||||
json_schema: dict[str, Any]
|
||||
created_at: datetime | None
|
||||
|
||||
|
||||
@dataclass(frozen=True, slots=True)
|
||||
class PackageRecord:
|
||||
"""Materialised package row projected into the registry API."""
|
||||
@@ -208,9 +220,15 @@ class Registry:
|
||||
retention_class: str,
|
||||
actor: str,
|
||||
metadata: dict[str, Any] | None = None,
|
||||
metadata_schema_slug: str | None = None,
|
||||
) -> UUID:
|
||||
"""Create a new package; returns its ``UUID``."""
|
||||
retention_class_row = await self._get_retention_class(retention_class)
|
||||
package_metadata = metadata or {}
|
||||
metadata_schema_id = await self._validate_metadata_schema(
|
||||
metadata_schema_slug,
|
||||
package_metadata,
|
||||
)
|
||||
package_id = uuid.uuid4()
|
||||
payload = cbor2.dumps(
|
||||
{
|
||||
@@ -218,7 +236,8 @@ class Registry:
|
||||
"producer": producer,
|
||||
"subject": subject,
|
||||
"retention_class": retention_class,
|
||||
"metadata": metadata or {},
|
||||
"metadata": package_metadata,
|
||||
"metadata_schema_id": str(metadata_schema_id) if metadata_schema_id else None,
|
||||
},
|
||||
canonical=True,
|
||||
)
|
||||
@@ -513,6 +532,48 @@ class Registry:
|
||||
for r in rows
|
||||
]
|
||||
|
||||
async def register_metadata_schema(
    self,
    *,
    slug: str,
    json_schema: dict[str, Any],
) -> UUID:
    """Register a package metadata JSON Schema, idempotent by slug.

    Returns the id of the row already stored under *slug* when there is one;
    otherwise inserts a new row and returns its freshly generated id.
    """
    # NOTE(review): when the slug already exists, the supplied json_schema is
    # neither compared with nor written over the stored document.
    lookup = select(metadata_schemas.c.id).where(metadata_schemas.c.slug == slug)
    async with self._engine.begin() as conn:
        found = (await conn.execute(lookup)).first()
        if found is not None:
            return UUID(str(found.id))

        new_id = uuid.uuid4()
        insertion = metadata_schemas.insert().values(
            id=new_id,
            slug=slug,
            json_schema=json_schema,
        )
        await conn.execute(insertion)
    return new_id
|
||||
|
||||
async def get_metadata_schema(self, slug: str) -> MetadataSchemaRecord:
    """Return one registered metadata schema by slug.

    Raises:
        KeyError: If no schema is stored under *slug*.
    """
    query = select(metadata_schemas).where(metadata_schemas.c.slug == slug)
    async with self._engine.connect() as conn:
        result = await conn.execute(query)
        row = result.first()

    if row is None:
        raise KeyError(f"metadata schema not found: {slug}")

    # Copy the document into a plain dict for the returned record.
    document = dict(row.json_schema)
    return MetadataSchemaRecord(
        id=row.id,
        slug=row.slug,
        json_schema=document,
        created_at=row.created_at,
    )
|
||||
|
||||
async def get_retention_state(self, package_id: UUID) -> RetentionStateRecord:
|
||||
"""Return the retention materialised view for one package."""
|
||||
async with self._engine.connect() as conn:
|
||||
@@ -902,6 +963,25 @@ class Registry:
|
||||
deletion_strategy=row.deletion_strategy,
|
||||
)
|
||||
|
||||
async def _validate_metadata_schema(
|
||||
self,
|
||||
slug: str | None,
|
||||
metadata: dict[str, Any],
|
||||
) -> UUID | None:
|
||||
if slug is None:
|
||||
return None
|
||||
try:
|
||||
schema = await self.get_metadata_schema(slug)
|
||||
except KeyError as exc:
|
||||
raise ValueError(str(exc)) from exc
|
||||
required = schema.json_schema.get("required", [])
|
||||
if not isinstance(required, list):
|
||||
raise ValueError(f"metadata schema {slug!r} has invalid required list")
|
||||
missing = [key for key in required if isinstance(key, str) and key not in metadata]
|
||||
if missing:
|
||||
raise ValueError(f"metadata missing required schema keys: {', '.join(missing)}")
|
||||
return schema.id
|
||||
|
||||
|
||||
def _iso(value: datetime | None) -> str | None:
|
||||
if value is None:
|
||||
|
||||
Reference in New Issue
Block a user