From 0b90004a6e708360352f040d8197988508add6fd Mon Sep 17 00:00:00 2001
From: tegwick <bernd.worsch@gmail.com>
Date: Thu, 7 May 2026 13:11:29 +0200
Subject: [PATCH] artifact refs and manifest fingerprinting

---
 docs/ARCHITECTURE-BLUEPRINT.md                |  4 ++
 docs/EXTENSION-SDK.md                         |  5 +-
 .../src/open_cmis_tck/preflight.py            | 62 ++++++++++++++++--
 ...PEN-CMIS-TCK-WP-0001-harness-foundation.md |  2 +
 src/guide_board/artifacts.py                  | 65 +++++++++++++++++++
 src/guide_board/execution.py                  | 15 ++++-
 tests/test_core.py                            | 28 ++++++++
 .../GUIDE-BOARD-WP-0001-bootstrapping.md      |  4 ++
 8 files changed, 177 insertions(+), 8 deletions(-)
 create mode 100644 src/guide_board/artifacts.py

diff --git a/docs/ARCHITECTURE-BLUEPRINT.md b/docs/ARCHITECTURE-BLUEPRINT.md
index b459c44..2e8d2c9 100644
--- a/docs/ARCHITECTURE-BLUEPRINT.md
+++ b/docs/ARCHITECTURE-BLUEPRINT.md
@@ -337,6 +337,10 @@ Stores run artifacts by reference and checksum:
 - profile snapshots,
 - source lockfiles.
 
+The first implementation builds the assessment package artifact manifest from
+runner-emitted artifact refs and computes checksums for files inside the run
+directory.
+
 ### Normalizer
 
 Converts extension output into guide-board evidence records.
diff --git a/docs/EXTENSION-SDK.md b/docs/EXTENSION-SDK.md
index d65b15a..997e601 100644
--- a/docs/EXTENSION-SDK.md
+++ b/docs/EXTENSION-SDK.md
@@ -135,6 +135,10 @@ Result fields:
 - `facts`: structured facts extracted by the runner.
 - `artifact_refs`: references to raw artifacts written by the runner.
 
+Artifact refs must be paths relative to the run directory. After runner
+execution, the core fingerprints every ref that resolves to an existing file
+inside the run directory into the assessment package `artifact_manifest`.
+
 If a Python runner raises an exception, the core converts that failure into
 `infrastructure_error` evidence so the assessment package remains complete.
 
@@ -163,7 +167,6 @@ Initial statuses:
 
 ## Next SDK Steps
 
-- Add artifact helper APIs for extension-generated raw files.
 - Add normalizer and mapping plug-in contracts.
 - Add extension-owned schema validation for domain-specific target profile
   fields.
diff --git a/extensions/open-cmis-tck/src/open_cmis_tck/preflight.py b/extensions/open-cmis-tck/src/open_cmis_tck/preflight.py
index 39fba96..5029b0c 100644
--- a/extensions/open-cmis-tck/src/open_cmis_tck/preflight.py
+++ b/extensions/open-cmis-tck/src/open_cmis_tck/preflight.py
@@ -3,6 +3,7 @@
 from __future__ import annotations
 
 import json
+from pathlib import Path
 from typing import Any
 from urllib.error import HTTPError, URLError
 from urllib.request import Request, urlopen
@@ -24,6 +25,7 @@ def run(context: dict[str, Any]) -> dict[str, Any]:
         }
 
     timeout = _timeout_seconds(context)
+    artifact_refs: list[str] = []
     request = Request(
         endpoint["url"],
         headers={
@@ -35,8 +37,25 @@ def run(context: dict[str, Any]) -> dict[str, Any]:
         with urlopen(request, timeout=timeout) as response:
             status_code = response.status
             content_type = response.headers.get("Content-Type", "")
+            headers = dict(response.headers.items())
             body = response.read(1024 * 1024)
+            artifact_refs = _write_response_artifacts(
+                context,
+                status_code,
+                content_type,
+                headers,
+                body,
+            )
     except HTTPError as exc:
+        body = exc.read(1024 * 1024)
+        content_type = exc.headers.get("Content-Type", "")
+        artifact_refs = _write_response_artifacts(
+            context,
+            exc.code,
+            content_type,
+            dict(exc.headers.items()),
+            body,
+        )
         return {
             "result": "infrastructure_error",
             "observations": [
@@ -46,8 +65,9 @@ def run(context: dict[str, Any]) -> dict[str, Any]:
                 "endpoint_found": True,
                 "url": endpoint["url"],
                 "http_status": exc.code,
+                "content_type": content_type,
             },
-            "artifact_refs": [],
+            "artifact_refs": artifact_refs,
         }
     except URLError as exc:
         return {
@@ -60,7 +80,7 @@ def run(context: dict[str, Any]) -> dict[str, Any]:
                 "url": endpoint["url"],
                 "error": str(exc.reason),
             },
-            "artifact_refs": [],
+            "artifact_refs": artifact_refs,
         }
     except TimeoutError:
         return {
@@ -73,7 +93,7 @@ def run(context: dict[str, Any]) -> dict[str, Any]:
                 "url": endpoint["url"],
                 "timeout_seconds": timeout,
             },
-            "artifact_refs": [],
+            "artifact_refs": artifact_refs,
         }
 
     facts: dict[str, Any] = {
@@ -93,7 +113,7 @@ def run(context: dict[str, Any]) -> dict[str, Any]:
                 "CMIS Browser Binding endpoint is reachable but did not return parseable JSON."
             ],
             "facts": facts,
-            "artifact_refs": [],
+            "artifact_refs": artifact_refs,
         }
 
     facts["json_detected"] = True
@@ -104,7 +124,7 @@ def run(context: dict[str, Any]) -> dict[str, Any]:
             "CMIS Browser Binding endpoint is reachable and returned parseable JSON."
         ],
         "facts": facts,
-        "artifact_refs": [],
+        "artifact_refs": artifact_refs,
     }
 
 
@@ -159,3 +179,35 @@ def _repository_facts(value: Any) -> dict[str, Any]:
         "repository_shape": "object",
         "top_level_keys": sorted(str(key) for key in value.keys())[:20],
     }
+
+
+def _write_response_artifacts(
+    context: dict[str, Any],
+    status_code: int,
+    content_type: str,
+    headers: dict[str, str],
+    body: bytes,
+) -> list[str]:
+    run_dir = Path(context["run_dir"])
+    artifact_dir = run_dir / "artifacts" / "open-cmis-tck" / "preflight"
+    artifact_dir.mkdir(parents=True, exist_ok=True)
+
+    response_ref = "artifacts/open-cmis-tck/preflight/response-body.bin"
+    metadata_ref = "artifacts/open-cmis-tck/preflight/response-metadata.json"
+
+    (run_dir / response_ref).write_bytes(body)
+    (run_dir / metadata_ref).write_text(
+        json.dumps(
+            {
+                "status_code": status_code,
+                "content_type": content_type,
+                "headers": headers,
+                "byte_count": len(body),
+            },
+            indent=2,
+            sort_keys=True,
+        )
+        + "\n",
+        encoding="utf-8",
+    )
+    return [metadata_ref, response_ref]
diff --git a/extensions/open-cmis-tck/workplans/OPEN-CMIS-TCK-WP-0001-harness-foundation.md b/extensions/open-cmis-tck/workplans/OPEN-CMIS-TCK-WP-0001-harness-foundation.md
index 9052d3e..c71e4ad 100644
--- a/extensions/open-cmis-tck/workplans/OPEN-CMIS-TCK-WP-0001-harness-foundation.md
+++ b/extensions/open-cmis-tck/workplans/OPEN-CMIS-TCK-WP-0001-harness-foundation.md
@@ -107,6 +107,8 @@ Progress:
 
 - The first CMIS Browser Binding preflight runner checks endpoint reachability
   and parseable JSON repository metadata through the guide-board runner bridge.
+- The preflight runner preserves raw response metadata and body artifacts for
+  assessment-package fingerprinting.
 - Capability flag normalization remains to be expanded after a live target sample
   is captured.
 
diff --git a/src/guide_board/artifacts.py b/src/guide_board/artifacts.py
new file mode 100644
index 0000000..b91193d
--- /dev/null
+++ b/src/guide_board/artifacts.py
@@ -0,0 +1,65 @@
+"""Artifact manifest helpers."""
+
+from __future__ import annotations
+
+import hashlib
+import mimetypes
+from datetime import datetime, timezone
+from pathlib import Path
+from typing import Any
+
+from guide_board.schema import assert_valid
+
+
+def build_artifact_manifest(
+    run_dir: Path,
+    run_id: str,
+    evidence: list[dict[str, Any]],
+) -> list[dict[str, Any]]:
+    artifacts: list[dict[str, Any]] = []
+    seen: set[str] = set()
+    for item in evidence:
+        producer = item["check_id"]
+        for artifact_ref in item.get("artifact_refs", []):
+            if not isinstance(artifact_ref, str) or artifact_ref in seen:
+                continue
+            seen.add(artifact_ref)
+            path = (run_dir / artifact_ref).resolve()
+            try:
+                path.relative_to(run_dir.resolve())
+            except ValueError:
+                continue
+            if not path.exists() or not path.is_file():
+                continue
+            artifact = {
+                "id": f"artifact:{_safe_id(artifact_ref)}",
+                "run_id": run_id,
+                "path": artifact_ref,
+                "media_type": _media_type(path),
+                "producer": producer,
+                "checksum": f"sha256:{_sha256(path)}",
+                "created_at": datetime.now(timezone.utc).isoformat(),
+                "retention_class": "raw",
+            }
+            assert_valid(artifact, "raw-artifact")
+            artifacts.append(artifact)
+    return artifacts
+
+
+def _sha256(path: Path) -> str:
+    digest = hashlib.sha256()
+    with path.open("rb") as handle:
+        for chunk in iter(lambda: handle.read(1024 * 1024), b""):
+            digest.update(chunk)
+    return digest.hexdigest()
+
+
+def _media_type(path: Path) -> str:
+    guessed, _ = mimetypes.guess_type(path.name)
+    if guessed:
+        return guessed
+    return "application/octet-stream"
+
+
+def _safe_id(value: str) -> str:
+    return "".join(char if char.isalnum() or char in {"-", "_"} else "_" for char in value)
diff --git a/src/guide_board/execution.py b/src/guide_board/execution.py
index 31949f5..911679f 100644
--- a/src/guide_board/execution.py
+++ b/src/guide_board/execution.py
@@ -7,6 +7,7 @@ from datetime import datetime, timezone
 from pathlib import Path
 from typing import Any
 
+from guide_board.artifacts import build_artifact_manifest
 from guide_board.io import write_json
 from guide_board.planning import build_run_plan
 from guide_board.runners import run_step
@@ -35,7 +36,16 @@ def run_assessment(
     for finding in findings:
         assert_valid(finding, "finding")
 
-    assessment_package = _assessment_package(run_id, plan, evidence, findings, created_at)
+    artifact_manifest = build_artifact_manifest(run_dir, run_id, evidence)
+
+    assessment_package = _assessment_package(
+        run_id,
+        plan,
+        evidence,
+        findings,
+        artifact_manifest,
+        created_at,
+    )
     assert_valid(assessment_package, "assessment-package")
 
     run_metadata = {
@@ -164,6 +174,7 @@ def _assessment_package(
     plan: dict[str, Any],
     evidence: list[dict[str, Any]],
     findings: list[dict[str, Any]],
+    artifact_manifest: list[dict[str, Any]],
     created_at: str,
 ) -> dict[str, Any]:
     summary = dict(Counter(item["result"] for item in evidence))
@@ -179,7 +190,7 @@ def _assessment_package(
         "summary": summary,
         "findings": findings,
         "evidence_refs": [item["id"] for item in evidence],
-        "artifact_manifest": [],
+        "artifact_manifest": artifact_manifest,
         "waivers": [],
         "certification_boundary": "Guide Board produces preparation evidence only and does not issue certifications or audit assurance.",
         "created_at": created_at,
diff --git a/tests/test_core.py b/tests/test_core.py
index 71644ae..c57b861 100644
--- a/tests/test_core.py
+++ b/tests/test_core.py
@@ -153,13 +153,35 @@ class CoreArchitectureTests(unittest.TestCase):
                         encoding="utf-8"
                     )
                 )
+                package = json.loads(
+                    (Path(result["run_dir"]) / "reports" / "assessment-package.json").read_text(
+                        encoding="utf-8"
+                    )
+                )
 
                 self.assertEqual(result["status"], "completed")
                 self.assertEqual(evidence["evidence"][0]["result"], "pass")
+                self.assertEqual(
+                    sorted(evidence["evidence"][0]["artifact_refs"]),
+                    [
+                        "artifacts/open-cmis-tck/preflight/response-body.bin",
+                        "artifacts/open-cmis-tck/preflight/response-metadata.json",
+                    ],
+                )
                 self.assertEqual(
                     evidence["evidence"][0]["facts"]["repository_ids"],
                     ["local-test-repository"],
                 )
+                self.assertEqual(len(package["artifact_manifest"]), 2)
+                self.assertTrue(
+                    (
+                        Path(result["run_dir"])
+                        / "artifacts"
+                        / "open-cmis-tck"
+                        / "preflight"
+                        / "response-metadata.json"
+                    ).exists()
+                )
         finally:
             server.shutdown()
             thread.join(timeout=5)
@@ -243,6 +265,11 @@ class CoreArchitectureTests(unittest.TestCase):
                         encoding="utf-8"
                     )
                 )["findings"]
+                package = json.loads(
+                    (Path(result["run_dir"]) / "reports" / "assessment-package.json").read_text(
+                        encoding="utf-8"
+                    )
+                )
 
                 self.assertEqual(result["status"], "blocked")
                 self.assertEqual(evidence[0]["result"], "pass")
@@ -256,6 +283,7 @@ class CoreArchitectureTests(unittest.TestCase):
                     findings[0]["classification"],
                     evidence[1]["facts"]["blocked_reason"],
                 )
+                self.assertGreaterEqual(len(package["artifact_manifest"]), 3)
         finally:
             server.shutdown()
             thread.join(timeout=5)
diff --git a/workplans/GUIDE-BOARD-WP-0001-bootstrapping.md b/workplans/GUIDE-BOARD-WP-0001-bootstrapping.md
index 35c94e9..65bd865 100644
--- a/workplans/GUIDE-BOARD-WP-0001-bootstrapping.md
+++ b/workplans/GUIDE-BOARD-WP-0001-bootstrapping.md
@@ -184,6 +184,8 @@ Acceptance:
   of CMIS.
 - The baseline executor writes the run directory contract, normalized evidence,
   an assessment package, and a Markdown report.
+- The assessment package includes a fingerprinted artifact manifest for
+  runner-emitted raw artifacts.
 
 ## D1.7 - Extension SDK Skeleton
 
@@ -204,6 +206,8 @@ Acceptance:
 - Python module runner contracts are documented in `docs/EXTENSION-SDK.md`.
 - Manifest-declared command runners execute without shell expansion and return
   normalized evidence through the same runner result contract.
+- Runner artifact refs are constrained to the run directory and fingerprinted in
+  the assessment package artifact manifest.
 
 ## D1.8 - CMIS Seed Extension Integration