artifact refs and manifest fingerprinting

2026-05-07 13:11:29 +02:00
parent 12ab9c88cb
commit 0b90004a6e
8 changed files with 177 additions and 8 deletions
--- a/docs/ARCHITECTURE-BLUEPRINT.md
+++ b/docs/ARCHITECTURE-BLUEPRINT.md
@@ -337,6 +337,10 @@ Stores run artifacts by reference and checksum:
 - profile snapshots,
 - source lockfiles.

+The first implementation builds the assessment package artifact manifest from
+runner-emitted artifact refs and computes checksums for files inside the run
+directory.
+
 ### Normalizer

 Converts extension output into guide-board evidence records.
--- a/docs/EXTENSION-SDK.md
+++ b/docs/EXTENSION-SDK.md
@@ -135,6 +135,10 @@ Result fields:
 - `facts`: structured facts extracted by the runner.
 - `artifact_refs`: references to raw artifacts written by the runner.

+Artifact refs must be paths relative to the run directory. After runner
+execution, the core fingerprints each ref that resolves to an existing file
+inside the run directory into the assessment package `artifact_manifest`.
+
 If a Python runner raises an exception, the core converts that failure into
 `infrastructure_error` evidence so the assessment package remains complete.

@@ -163,7 +167,6 @@ Initial statuses:

 ## Next SDK Steps

- Add artifact helper APIs for extension-generated raw files.
 - Add normalizer and mapping plug-in contracts.
 - Add extension-owned schema validation for domain-specific target profile
  fields.
--- a/extensions/open-cmis-tck/src/open_cmis_tck/preflight.py
+++ b/extensions/open-cmis-tck/src/open_cmis_tck/preflight.py
@@ -3,6 +3,7 @@
 from __future__ import annotations

 import json
+from pathlib import Path
 from typing import Any
 from urllib.error import HTTPError, URLError
 from urllib.request import Request, urlopen
@@ -24,6 +25,7 @@ def run(context: dict[str, Any]) -> dict[str, Any]:
        }

    timeout = _timeout_seconds(context)
+    artifact_refs: list[str] = []
    request = Request(
        endpoint["url"],
        headers={
@@ -35,8 +37,25 @@ def run(context: dict[str, Any]) -> dict[str, Any]:
        with urlopen(request, timeout=timeout) as response:
            status_code = response.status
            content_type = response.headers.get("Content-Type", "")
+            headers = dict(response.headers.items())
            body = response.read(1024 * 1024)
+            artifact_refs = _write_response_artifacts(
+                context,
+                status_code,
+                content_type,
+                headers,
+                body,
+            )
    except HTTPError as exc:
+        body = exc.read(1024 * 1024)
+        content_type = exc.headers.get("Content-Type", "")
+        artifact_refs = _write_response_artifacts(
+            context,
+            exc.code,
+            content_type,
+            dict(exc.headers.items()),
+            body,
+        )
        return {
            "result": "infrastructure_error",
            "observations": [
@@ -46,8 +65,9 @@ def run(context: dict[str, Any]) -> dict[str, Any]:
                "endpoint_found": True,
                "url": endpoint["url"],
                "http_status": exc.code,
+                "content_type": content_type,
            },
-            "artifact_refs": [],
+            "artifact_refs": artifact_refs,
        }
    except URLError as exc:
        return {
@@ -60,7 +80,7 @@ def run(context: dict[str, Any]) -> dict[str, Any]:
                "url": endpoint["url"],
                "error": str(exc.reason),
            },
-            "artifact_refs": [],
+            "artifact_refs": artifact_refs,
        }
    except TimeoutError:
        return {
@@ -73,7 +93,7 @@ def run(context: dict[str, Any]) -> dict[str, Any]:
                "url": endpoint["url"],
                "timeout_seconds": timeout,
            },
-            "artifact_refs": [],
+            "artifact_refs": artifact_refs,
        }

    facts: dict[str, Any] = {
@@ -93,7 +113,7 @@ def run(context: dict[str, Any]) -> dict[str, Any]:
                "CMIS Browser Binding endpoint is reachable but did not return parseable JSON."
            ],
            "facts": facts,
-            "artifact_refs": [],
+            "artifact_refs": artifact_refs,
        }

    facts["json_detected"] = True
@@ -104,7 +124,7 @@ def run(context: dict[str, Any]) -> dict[str, Any]:
            "CMIS Browser Binding endpoint is reachable and returned parseable JSON."
        ],
        "facts": facts,
-        "artifact_refs": [],
+        "artifact_refs": artifact_refs,
    }


@@ -159,3 +179,35 @@ def _repository_facts(value: Any) -> dict[str, Any]:
        "repository_shape": "object",
        "top_level_keys": sorted(str(key) for key in value.keys())[:20],
    }
+
+
+def _write_response_artifacts(  # persist the raw HTTP response as run artifacts
+    context: dict[str, Any],  # must provide "run_dir"
+    status_code: int,
+    content_type: str,
+    headers: dict[str, str],
+    body: bytes,
+) -> list[str]:  # returns the two artifact refs, relative to run_dir
+    run_dir = Path(context["run_dir"])
+    artifact_dir = run_dir / "artifacts" / "open-cmis-tck" / "preflight"
+    artifact_dir.mkdir(parents=True, exist_ok=True)  # no error if it already exists
+
+    response_ref = "artifacts/open-cmis-tck/preflight/response-body.bin"  # fixed name: a repeat call overwrites it
+    metadata_ref = "artifacts/open-cmis-tck/preflight/response-metadata.json"  # fixed name as well
+
+    (run_dir / response_ref).write_bytes(body)  # body is stored verbatim
+    (run_dir / metadata_ref).write_text(
+        json.dumps(
+            {
+                "status_code": status_code,
+                "content_type": content_type,
+                "headers": headers,
+                "byte_count": len(body),  # size of the stored body, not a header value
+            },
+            indent=2,
+            sort_keys=True,  # stable key order in the written JSON
+        )
+        + "\n",
+        encoding="utf-8",
+    )
+    return [metadata_ref, response_ref]  # metadata ref first, then body ref
--- a/extensions/open-cmis-tck/workplans/OPEN-CMIS-TCK-WP-0001-harness-foundation.md
+++ b/extensions/open-cmis-tck/workplans/OPEN-CMIS-TCK-WP-0001-harness-foundation.md
@@ -107,6 +107,8 @@ Progress:

 - The first CMIS Browser Binding preflight runner checks endpoint reachability
  and parseable JSON repository metadata through the guide-board runner bridge.
+- The preflight runner preserves raw response metadata and body artifacts for
+  assessment-package fingerprinting.
 - Capability flag normalization remains to be expanded after a live target sample
  is captured.

--- a/src/guide_board/artifacts.py
+++ b/src/guide_board/artifacts.py
@@ -0,0 +1,65 @@
+"""Artifact manifest helpers."""
+
+from __future__ import annotations
+
+import hashlib
+import mimetypes
+from datetime import datetime, timezone
+from pathlib import Path
+from typing import Any
+
+from guide_board.schema import assert_valid
+
+
+def build_artifact_manifest(  # turn runner-emitted artifact refs into manifest entries
+    run_dir: Path,
+    run_id: str,
+    evidence: list[dict[str, Any]],  # each item needs "check_id"; "artifact_refs" is optional
+) -> list[dict[str, Any]]:  # one entry per accepted ref, in first-seen order
+    artifacts: list[dict[str, Any]] = []
+    seen: set[str] = set()  # refs already considered, across all evidence items
+    for item in evidence:
+        producer = item["check_id"]  # the first item listing a ref is recorded as its producer
+        for artifact_ref in item.get("artifact_refs", []):
+            if not isinstance(artifact_ref, str) or artifact_ref in seen:  # skip non-strings and duplicates
+                continue
+            seen.add(artifact_ref)  # recorded before validation, so a rejected ref is never retried
+            path = (run_dir / artifact_ref).resolve()  # resolve() collapses ".." and follows symlinks
+            try:
+                path.relative_to(run_dir.resolve())  # containment check against the resolved run_dir
+            except ValueError:  # ref resolves outside run_dir
+                continue  # dropped without raising or reporting
+            if not path.exists() or not path.is_file():  # missing paths and non-files (e.g. directories)
+                continue
+            artifact = {
+                "id": f"artifact:{_safe_id(artifact_ref)}",
+                "run_id": run_id,
+                "path": artifact_ref,  # the ref as emitted, not the resolved absolute path
+                "media_type": _media_type(path),
+                "producer": producer,
+                "checksum": f"sha256:{_sha256(path)}",
+                "created_at": datetime.now(timezone.utc).isoformat(),  # manifest build time (UTC), not file mtime
+                "retention_class": "raw",
+            }
+            assert_valid(artifact, "raw-artifact")  # presumably raises on schema violation — TODO confirm
+            artifacts.append(artifact)
+    return artifacts
+
+
+def _sha256(path: Path) -> str:  # hex SHA-256 digest of the file's bytes
+    digest = hashlib.sha256()
+    with path.open("rb") as handle:
+        for chunk in iter(lambda: handle.read(1024 * 1024), b""):  # 1 MiB reads until EOF (b"")
+            digest.update(chunk)
+    return digest.hexdigest()
+
+
+def _media_type(path: Path) -> str:  # media type for a manifest entry
+    guessed, _ = mimetypes.guess_type(path.name)  # based on the file name only; content is not read
+    if guessed:
+        return guessed
+    return "application/octet-stream"  # fallback when no type could be guessed
+
+
+def _safe_id(value: str) -> str:  # NOTE(review): lossy — "a/b" and "a_b" both yield "a_b", so ids can collide
+    return "".join(char if char.isalnum() or char in {"-", "_"} else "_" for char in value)  # other chars become "_"
--- a/src/guide_board/execution.py
+++ b/src/guide_board/execution.py
@@ -7,6 +7,7 @@ from datetime import datetime, timezone
 from pathlib import Path
 from typing import Any

+from guide_board.artifacts import build_artifact_manifest
 from guide_board.io import write_json
 from guide_board.planning import build_run_plan
 from guide_board.runners import run_step
@@ -35,7 +36,16 @@ def run_assessment(
    for finding in findings:
        assert_valid(finding, "finding")

-    assessment_package = _assessment_package(run_id, plan, evidence, findings, created_at)
+    artifact_manifest = build_artifact_manifest(run_dir, run_id, evidence)
+
+    assessment_package = _assessment_package(
+        run_id,
+        plan,
+        evidence,
+        findings,
+        artifact_manifest,
+        created_at,
+    )
    assert_valid(assessment_package, "assessment-package")

    run_metadata = {
@@ -164,6 +174,7 @@ def _assessment_package(
    plan: dict[str, Any],
    evidence: list[dict[str, Any]],
    findings: list[dict[str, Any]],
+    artifact_manifest: list[dict[str, Any]],
    created_at: str,
 ) -> dict[str, Any]:
    summary = dict(Counter(item["result"] for item in evidence))
@@ -179,7 +190,7 @@ def _assessment_package(
        "summary": summary,
        "findings": findings,
        "evidence_refs": [item["id"] for item in evidence],
-        "artifact_manifest": [],
+        "artifact_manifest": artifact_manifest,
        "waivers": [],
        "certification_boundary": "Guide Board produces preparation evidence only and does not issue certifications or audit assurance.",
        "created_at": created_at,
--- a/tests/test_core.py
+++ b/tests/test_core.py
@@ -153,13 +153,35 @@ class CoreArchitectureTests(unittest.TestCase):
                        encoding="utf-8"
                    )
                )
+                package = json.loads(
+                    (Path(result["run_dir"]) / "reports" / "assessment-package.json").read_text(
+                        encoding="utf-8"
+                    )
+                )

                self.assertEqual(result["status"], "completed")
                self.assertEqual(evidence["evidence"][0]["result"], "pass")
+                self.assertEqual(
+                    sorted(evidence["evidence"][0]["artifact_refs"]),
+                    [
+                        "artifacts/open-cmis-tck/preflight/response-body.bin",
+                        "artifacts/open-cmis-tck/preflight/response-metadata.json",
+                    ],
+                )
                self.assertEqual(
                    evidence["evidence"][0]["facts"]["repository_ids"],
                    ["local-test-repository"],
                )
+                self.assertEqual(len(package["artifact_manifest"]), 2)
+                self.assertTrue(
+                    (
+                        Path(result["run_dir"])
+                        / "artifacts"
+                        / "open-cmis-tck"
+                        / "preflight"
+                        / "response-metadata.json"
+                    ).exists()
+                )
        finally:
            server.shutdown()
            thread.join(timeout=5)
@@ -243,6 +265,11 @@ class CoreArchitectureTests(unittest.TestCase):
                        encoding="utf-8"
                    )
                )["findings"]
+                package = json.loads(
+                    (Path(result["run_dir"]) / "reports" / "assessment-package.json").read_text(
+                        encoding="utf-8"
+                    )
+                )

                self.assertEqual(result["status"], "blocked")
                self.assertEqual(evidence[0]["result"], "pass")
@@ -256,6 +283,7 @@ class CoreArchitectureTests(unittest.TestCase):
                    findings[0]["classification"],
                    evidence[1]["facts"]["blocked_reason"],
                )
+                self.assertGreaterEqual(len(package["artifact_manifest"]), 3)
        finally:
            server.shutdown()
            thread.join(timeout=5)
--- a/workplans/GUIDE-BOARD-WP-0001-bootstrapping.md
+++ b/workplans/GUIDE-BOARD-WP-0001-bootstrapping.md
@@ -184,6 +184,8 @@ Acceptance:
  of CMIS.
 - The baseline executor writes the run directory contract, normalized evidence,
  an assessment package, and a Markdown report.
+- The assessment package includes a fingerprinted artifact manifest for
+  runner-emitted raw artifacts.

 ## D1.7 - Extension SDK Skeleton

@@ -204,6 +206,8 @@ Acceptance:
 - Python module runner contracts are documented in `docs/EXTENSION-SDK.md`.
 - Manifest-declared command runners execute without shell expansion and return
  normalized evidence through the same runner result contract.
+- Runner artifact refs are constrained to the run directory and fingerprinted in
+  the assessment package artifact manifest.

 ## D1.8 - CMIS Seed Extension Integration