From 24f4c09d428e8be95ebb113b2d8a4ef7eb67bff6 Mon Sep 17 00:00:00 2001
From: tegwick <bernd.worsch@gmail.com>
Date: Wed, 3 Jun 2026 11:56:21 +0200
Subject: [PATCH] Implement llm-connect ADHOC diagnostics

---
 ARCHITECTURE-LAYERS.md                |   3 +
 README.md                             |  38 +-
 docs/adapter-model-params.md          | 102 +++++
 llm_connect/_diagnostics.py           | 153 +++++++
 llm_connect/_http.py                  | 187 +++++----
 llm_connect/_payload.py               | 154 +++++++
 llm_connect/claude_code.py            | 566 +++++++++++++-------------
 llm_connect/gemini.py                 |   3 +
 llm_connect/openai.py                 |   3 +
 llm_connect/openrouter.py             | 372 +++++++----------
 llm_connect/replay.py                 | 121 ++++++
 llm_connect/server.py                 |  85 +++-
 tests/test_payload.py                 |  81 ++++
 tests/test_replay.py                  |  62 +++
 tests/test_server.py                  | 123 +++++-
 tests/test_structured_output_smoke.py | 142 +++++++
 workplans/ADHOC-2026-06-02.md         |  34 +-
 17 files changed, 1618 insertions(+), 611 deletions(-)
 create mode 100644 docs/adapter-model-params.md
 create mode 100644 llm_connect/_diagnostics.py
 create mode 100644 llm_connect/_payload.py
 create mode 100644 llm_connect/replay.py
 create mode 100644 tests/test_payload.py
 create mode 100644 tests/test_replay.py
 create mode 100644 tests/test_structured_output_smoke.py

diff --git a/ARCHITECTURE-LAYERS.md b/ARCHITECTURE-LAYERS.md
index 1c6a3ee..c954236 100644
--- a/ARCHITECTURE-LAYERS.md
+++ b/ARCHITECTURE-LAYERS.md
@@ -32,6 +32,9 @@ Maturity states: **Experimental → Beta → Stable → Deprecated**
 | `gemini.py` | `GeminiAdapter` — Google Generative Language API | Beta |
 | `openrouter.py` | `OpenRouterAdapter` — OpenAI-compatible multi-model routing | Beta |
 | `claude_code.py` | `ClaudeCodeAdapter` — `claude --print` subprocess | Beta |
+| `_payload.py` | Shared adapter payload translation for `RunConfig.model_params` | Beta |
+| `_diagnostics.py` | Opt-in per-call diagnostics capture for server debug and audit modes | Beta |
+| `replay.py` | Audit replay parser CLI (`python -m llm_connect.replay`) | Beta |
 | `embedding_adapter.py` | `EmbeddingAdapter` ABC | Beta |
 | `embedding_openai.py` | `OpenAICompatibleEmbeddingAdapter` | Beta |
 | `embedding_cache.py` | `EmbeddingCache` — disk-backed embedding cache | Beta |
diff --git a/README.md b/README.md
index 1ac1514..0fd8690 100644
--- a/README.md
+++ b/README.md
@@ -73,15 +73,15 @@ config = RunConfig(
 )
 ```
 
-| Field | Default | Description |
-|---|---|---|
-| `model_name` | `"gpt-4"` | Model identifier (adapter may override) |
-| `temperature` | `0.7` | Sampling temperature |
-| `max_tokens` | `2000` | Maximum output tokens |
-| `model_params` | `{}` | Extra provider-specific parameters |
-| `max_depth` | `3` | Max nesting depth for recursive calls |
-| `skip_if_exists` | `True` | Skip if identical input hash already processed |
-| `timeout_seconds` | `300` | Request timeout |
+| Field | Default | Description |
+|---|---|---|
+| `model_name` | `"gpt-4"` | Model identifier (adapter may override) |
+| `temperature` | `0.7` | Sampling temperature |
+| `max_tokens` | `2000` | Maximum output tokens |
+| `model_params` | `{}` | Portable extras translated by each adapter; see `docs/adapter-model-params.md` |
+| `max_depth` | `3` | Max nesting depth for recursive calls |
+| `skip_if_exists` | `True` | Skip if identical input hash already processed |
+| `timeout_seconds` | `300` | Request timeout |
 
 ### `LLMResponse`
 
@@ -92,8 +92,24 @@ response = adapter.execute_prompt(prompt, config)
 print(response.content)       # generated text
 print(response.model)         # model actually used
 print(response.usage)         # {"prompt_tokens": …, "completion_tokens": …, "total_tokens": …}
-print(response.finish_reason) # "stop", "length", etc.
-```
+print(response.finish_reason) # "stop", "length", etc.
+```
+
+## Server diagnostics
+
+Serve mode can include a debug envelope without changing normal responses:
+
+```bash
+LLM_CONNECT_DEBUG=1 python -m llm_connect.server --provider openrouter
+curl 'http://127.0.0.1:8080/execute?debug=1' -d '{"prompt":"hi"}'
+```
+
+Set `LLM_CONNECT_AUDIT_DIR=/path/to/audit` to write per-call replay records,
+then parse one without another provider call:
+
+```bash
+python -m llm_connect.replay /path/to/audit/record.json --json
+```
 
 ## Writing your own adapter
 
diff --git a/docs/adapter-model-params.md b/docs/adapter-model-params.md
new file mode 100644
index 0000000..61e8ee3
--- /dev/null
+++ b/docs/adapter-model-params.md
@@ -0,0 +1,102 @@
+# Adapter `model_params` contract
+
+`RunConfig.model_params` is a portability layer, not a blind provider payload
+escape hatch. Adapters must translate the shared keys they understand, pass
+through only provider-valid keys, and drop provider-specific keys that would
+make another provider reject the request.
+
+## Shared structured output
+
+Callers may request structured output with:
+
+```python
+RunConfig(
+    model_params={
+        "json_schema": {
+            "type": "object",
+            "properties": {
+                "summary": {"type": "string"},
+                "recommendations": {"type": "array", "items": {"type": "string"}},
+            },
+            "required": ["summary", "recommendations"],
+        }
+    }
+)
+```
+
+Adapters translate that key into the provider's native shape:
+
+| Adapter | Translation |
+|---|---|
+| OpenAI | `response_format = {"type": "json_schema", "json_schema": ...}` |
+| OpenRouter | Same OpenAI-compatible `response_format` wrapper |
+| Gemini | `generationConfig.responseMimeType = "application/json"` and `generationConfig.responseSchema = ...` |
+| Claude Code CLI | `--json-schema <schema>` plus `--output-format json`, then envelope unwrap |
+
+OpenAI-compatible adapters default `json_schema.strict` to `False`. Strict mode
+requires schemas to meet provider-specific constraints such as
+`additionalProperties: false` on object nodes and complete `required` lists.
+Callers that need strict behavior can pass an explicit provider-native
+`response_format` in `model_params`.
+
+## Pass-through keys
+
+OpenAI and OpenRouter pass through known Chat Completions fields:
+
+`top_p`, `n`, `stream`, `stop`, `presence_penalty`, `frequency_penalty`,
+`logit_bias`, `user`, `seed`, `tools`, `tool_choice`, `response_format`,
+`logprobs`, `top_logprobs`, and `parallel_tool_calls`.
+
+Gemini passes through valid `generateContent` top-level fields:
+
+`safetySettings`, `tools`, `toolConfig`, `systemInstruction`, and
+`cachedContent`.
+
+Gemini also accepts generation config fields directly or via snake-case aliases:
+
+`candidateCount`, `candidate_count`, `stopSequences`, `stop_sequences`,
+`maxOutputTokens`, `max_output_tokens`, `temperature`, `topP`, `top_p`, `topK`,
+`top_k`, `responseMimeType`, `response_mime_type`, `responseSchema`, and
+`response_schema`.
+
+## Dropped keys
+
+Adapters must drop keys that are meaningful to another adapter or to
+llm-connect itself but invalid for the target provider. The current shared drop
+set includes:
+
+`reasoning_effort`, `max_depth`, `claude_cli_path`, and raw `json_schema` after
+translation.
+
+Unknown keys are ignored by default. This keeps activity-specific configs from
+causing provider HTTP 400 errors when a caller switches providers.
+
+## Diagnostics and replay
+
+Server mode supports opt-in diagnostics for `/execute`:
+
+```bash
+LLM_CONNECT_DEBUG=1 python -m llm_connect.server --provider openrouter
+curl 'http://127.0.0.1:8080/execute?debug=1' -d '{"prompt":"hi"}'
+```
+
+Debug responses include a `debug` field with the redacted provider request, raw
+provider response body, and adapter transformations such as `merge_model_params`
+or `unwrap_cli_envelope`. Normal responses omit `debug`.
+
+Set `LLM_CONNECT_AUDIT_DIR=/path/to/audit` to write one JSON audit record per
+`/execute` call. Audit records include the prompt, config, redacted provider
+request, provider response, parsed content, and latency. Re-run parsing without
+another provider call with:
+
+```bash
+python -m llm_connect.replay /path/to/audit/record.json --json
+```
+
+## Server concurrency
+
+`llm_connect.server.LLMServer` uses `ThreadingHTTPServer`. Adapter instances
+used in server mode must be safe to call concurrently. The bundled HTTP and
+subprocess adapters keep per-call state local; custom adapters should avoid
+mutating shared instance attributes during `execute_prompt` unless they use
+their own locks.
diff --git a/llm_connect/_diagnostics.py b/llm_connect/_diagnostics.py
new file mode 100644
index 0000000..3215e87
--- /dev/null
+++ b/llm_connect/_diagnostics.py
@@ -0,0 +1,153 @@
+"""Per-call diagnostics capture for server debug and audit modes."""
+
+from __future__ import annotations
+
+import copy
+import json
+from contextlib import contextmanager
+from contextvars import ContextVar
+from dataclasses import dataclass, field
+from typing import Any, Iterator, Mapping
+from urllib.parse import parse_qsl, urlencode, urlsplit, urlunsplit
+
+
+_SECRET_QUERY_KEYS = {"key", "api_key", "apikey", "access_token", "token"}
+_SECRET_HEADER_TOKENS = ("authorization", "api-key", "apikey", "token", "secret", "key")
+
+
+@dataclass
+class Diagnostics:
+    """Captured provider request/response details for one logical LLM call."""
+
+    provider_request: dict[str, Any] | None = None
+    provider_response: dict[str, Any] | None = None
+    adapter_transformations: list[dict[str, Any]] = field(default_factory=list)
+
+    def to_dict(self) -> dict[str, Any]:
+        return {
+            "provider_request": self.provider_request,
+            "provider_response": self.provider_response,
+            "adapter_transformations": self.adapter_transformations,
+        }
+
+
+_CURRENT: ContextVar[Diagnostics | None] = ContextVar(
+    "llm_connect_diagnostics",
+    default=None,
+)
+
+
+@contextmanager
+def capture_diagnostics(enabled: bool = True) -> Iterator[Diagnostics | None]:
+    """Capture diagnostics within this context when *enabled* is true."""
+
+    if not enabled:
+        yield None
+        return
+
+    diagnostics = Diagnostics()
+    token = _CURRENT.set(diagnostics)
+    try:
+        yield diagnostics
+    finally:
+        _CURRENT.reset(token)
+
+
+def diagnostics_enabled() -> bool:
+    return _CURRENT.get() is not None
+
+
+def current_diagnostics() -> Diagnostics | None:
+    return _CURRENT.get()
+
+
+def record_provider_request(
+    *,
+    url: str | None = None,
+    payload: Any | None = None,
+    headers: Mapping[str, Any] | None = None,
+    command: list[str] | None = None,
+) -> None:
+    diagnostics = _CURRENT.get()
+    if diagnostics is None:
+        return
+
+    request: dict[str, Any] = {}
+    if url is not None:
+        request["url"] = redact_url(url)
+    if payload is not None:
+        request["payload"] = json_safe(payload)
+    if headers is not None:
+        request["headers_redacted"] = redact_headers(headers)
+    if command is not None:
+        request["command"] = list(command)
+    diagnostics.provider_request = request
+
+
+def record_provider_response(*, status: int | None = None, body: Any | None = None) -> None:
+    diagnostics = _CURRENT.get()
+    if diagnostics is None:
+        return
+
+    response: dict[str, Any] = {}
+    if status is not None:
+        response["status"] = status
+    if body is not None:
+        response["body"] = json_safe(body)
+    diagnostics.provider_response = response
+
+
+def record_adapter_transformation(step: str, before: Any, after: Any) -> None:
+    diagnostics = _CURRENT.get()
+    if diagnostics is None:
+        return
+
+    diagnostics.adapter_transformations.append(
+        {
+            "step": step,
+            "before": json_safe(before),
+            "after": json_safe(after),
+        }
+    )
+
+
+def json_safe(value: Any) -> Any:
+    """Return a JSON-serializable snapshot of *value* without mutating it."""
+
+    try:
+        return json.loads(json.dumps(value))
+    except (TypeError, ValueError):
+        try:
+            return copy.deepcopy(value)
+        except Exception:
+            return repr(value)
+
+
+def redact_headers(headers: Mapping[str, Any]) -> dict[str, Any]:
+    redacted: dict[str, Any] = {}
+    for key, value in headers.items():
+        lowered = str(key).lower()
+        if any(token in lowered for token in _SECRET_HEADER_TOKENS):
+            redacted[str(key)] = _redact_header_value(value)
+        else:
+            redacted[str(key)] = json_safe(value)
+    return redacted
+
+
+def redact_url(url: str) -> str:
+    parts = urlsplit(url)
+    query = []
+    for key, value in parse_qsl(parts.query, keep_blank_values=True):
+        if key.lower() in _SECRET_QUERY_KEYS:
+            query.append((key, "<redacted>"))
+        else:
+            query.append((key, value))
+    return urlunsplit((parts.scheme, parts.netloc, parts.path, urlencode(query), parts.fragment))
+
+
+def _redact_header_value(value: Any) -> str:
+    text = str(value)
+    if " " in text:
+        scheme = text.split(" ", 1)[0]
+        return f"{scheme} <redacted>"
+    return "<redacted>"
diff --git a/llm_connect/_http.py b/llm_connect/_http.py
index d9429dc..4d3f66a 100644
--- a/llm_connect/_http.py
+++ b/llm_connect/_http.py
@@ -1,86 +1,101 @@
-"""
-Thin synchronous HTTP helper built on :mod:`urllib.request`.
-
-Translates HTTP errors into typed :mod:`markitect.llm.exceptions`.
-"""
-
-import json
-import urllib.request
-import urllib.error
-from typing import Dict, Any, Optional
-
-from llm_connect.exceptions import (
-    LLMAPIError,
-    LLMRateLimitError,
-    LLMTimeoutError,
-)
-
-
-def post_json(
-    url: str,
-    payload: Dict[str, Any],
-    headers: Optional[Dict[str, str]] = None,
-    timeout: int = 300,
-) -> Dict[str, Any]:
-    """POST *payload* as JSON and return the parsed response body.
-
-    Raises:
-        LLMRateLimitError: on HTTP 429
-        LLMAPIError: on other non-2xx responses
-        LLMTimeoutError: on socket / read timeout
-    """
-    data = json.dumps(payload).encode()
-    req = urllib.request.Request(
-        url,
-        data=data,
-        headers={"Content-Type": "application/json", **(headers or {})},
-        method="POST",
-    )
-
-    try:
-        with urllib.request.urlopen(req, timeout=timeout) as resp:
-            body = resp.read().decode()
-            try:
-                return json.loads(body)
-            except json.JSONDecodeError as exc:
-                preview = body[:300].replace("\n", "\\n")
-                raise LLMAPIError(
-                    f"Invalid JSON response from {url}: {exc} — body preview: {preview!r}",
-                    cause=exc,
-                ) from exc
-    except urllib.error.HTTPError as exc:
-        body = ""
-        try:
-            body = exc.read().decode()
-        except Exception:
-            pass
-
-        if exc.code == 429:
-            raise LLMRateLimitError(
-                f"Rate limited (429) from {url}",
-                status_code=429,
-                response_body=body,
-                cause=exc,
-            ) from exc
-
-        raise LLMAPIError(
-            f"HTTP {exc.code} from {url}",
-            status_code=exc.code,
-            response_body=body,
-            cause=exc,
-        ) from exc
-    except urllib.error.URLError as exc:
-        if "timed out" in str(exc.reason):
-            raise LLMTimeoutError(
-                f"Request to {url} timed out after {timeout}s",
-                cause=exc,
-            ) from exc
-        raise LLMAPIError(
-            f"URL error for {url}: {exc.reason}",
-            cause=exc,
-        ) from exc
-    except TimeoutError as exc:
-        raise LLMTimeoutError(
-            f"Request to {url} timed out after {timeout}s",
-            cause=exc,
-        ) from exc
+"""
+Thin synchronous HTTP helper built on :mod:`urllib.request`.
+
+Translates HTTP errors into typed :mod:`markitect.llm.exceptions`.
+"""
+
+import json
+import urllib.error
+import urllib.request
+from typing import Any, Dict, Optional
+
+from llm_connect._diagnostics import record_provider_request, record_provider_response
+from llm_connect.exceptions import (
+    LLMAPIError,
+    LLMRateLimitError,
+    LLMTimeoutError,
+)
+
+
+def post_json(
+    url: str,
+    payload: Dict[str, Any],
+    headers: Optional[Dict[str, str]] = None,
+    timeout: int = 300,
+) -> Dict[str, Any]:
+    """POST *payload* as JSON and return the parsed response body.
+
+    Raises:
+        LLMRateLimitError: on HTTP 429
+        LLMAPIError: on other non-2xx responses
+        LLMTimeoutError: on socket / read timeout
+    """
+    record_provider_request(url=url, payload=payload, headers=headers or {})
+    data = json.dumps(payload).encode()
+    req = urllib.request.Request(
+        url,
+        data=data,
+        headers={"Content-Type": "application/json", **(headers or {})},
+        method="POST",
+    )
+
+    try:
+        with urllib.request.urlopen(req, timeout=timeout) as resp:
+            body = resp.read().decode()
+            try:
+                parsed = json.loads(body)
+                record_provider_response(status=resp.status, body=parsed)
+                return parsed
+            except json.JSONDecodeError as exc:
+                record_provider_response(status=resp.status, body=body)
+                preview = body[:300].replace("\n", "\\n")
+                raise LLMAPIError(
+                    f"Invalid JSON response from {url}: {exc} - body preview: {preview!r}",
+                    cause=exc,
+                ) from exc
+    except urllib.error.HTTPError as exc:
+        body = ""
+        try:
+            body = exc.read().decode()
+        except Exception:
+            pass
+        record_provider_response(status=exc.code, body=_json_or_text(body))
+
+        if exc.code == 429:
+            raise LLMRateLimitError(
+                f"Rate limited (429) from {url}",
+                status_code=429,
+                response_body=body,
+                cause=exc,
+            ) from exc
+
+        raise LLMAPIError(
+            f"HTTP {exc.code} from {url}",
+            status_code=exc.code,
+            response_body=body,
+            cause=exc,
+        ) from exc
+    except urllib.error.URLError as exc:
+        record_provider_response(body={"error": str(exc.reason)})
+        if "timed out" in str(exc.reason):
+            raise LLMTimeoutError(
+                f"Request to {url} timed out after {timeout}s",
+                cause=exc,
+            ) from exc
+        raise LLMAPIError(
+            f"URL error for {url}: {exc.reason}",
+            cause=exc,
+        ) from exc
+    except TimeoutError as exc:
+        record_provider_response(body={"error": "timeout"})
+        raise LLMTimeoutError(
+            f"Request to {url} timed out after {timeout}s",
+            cause=exc,
+        ) from exc
+
+
+def _json_or_text(body: str) -> Any:
+    try:
+        return json.loads(body)
+    except (TypeError, ValueError):
+        return body
diff --git a/llm_connect/_payload.py b/llm_connect/_payload.py
new file mode 100644
index 0000000..74d5c75
--- /dev/null
+++ b/llm_connect/_payload.py
@@ -0,0 +1,154 @@
+"""Provider payload helpers for translating ``RunConfig.model_params``."""
+
+from __future__ import annotations
+
+import json
+from typing import Any
+
+from llm_connect._diagnostics import (
+    diagnostics_enabled,
+    json_safe,
+    record_adapter_transformation,
+)
+
+
+# OpenAI Chat Completions fields that map straight through from model_params.
+# Anything not in this set is provider-specific and must be either translated
+# or dropped. Blind merges are deliberately avoided because OpenAI-compatible
+# providers commonly reject unknown top-level fields with HTTP 400.
+OPENAI_CHAT_PASSTHROUGH_FIELDS = frozenset(
+    {
+        "top_p",
+        "n",
+        "stream",
+        "stop",
+        "presence_penalty",
+        "frequency_penalty",
+        "logit_bias",
+        "user",
+        "seed",
+        "tools",
+        "tool_choice",
+        "response_format",
+        "logprobs",
+        "top_logprobs",
+        "parallel_tool_calls",
+    }
+)
+
+
+DROPPED_NON_OPENAI_FIELDS = frozenset(
+    {
+        "reasoning_effort",
+        "max_depth",
+        "claude_cli_path",
+        "json_schema",
+    }
+)
+
+
+GEMINI_TOP_LEVEL_FIELDS = frozenset(
+    {
+        "safetySettings",
+        "tools",
+        "toolConfig",
+        "systemInstruction",
+        "cachedContent",
+    }
+)
+
+
+GEMINI_GENERATION_CONFIG_FIELDS = frozenset(
+    {
+        "candidateCount",
+        "stopSequences",
+        "maxOutputTokens",
+        "temperature",
+        "topP",
+        "topK",
+        "responseMimeType",
+        "responseSchema",
+    }
+)
+
+
+GEMINI_GENERATION_CONFIG_ALIASES = {
+    "candidate_count": "candidateCount",
+    "stop_sequences": "stopSequences",
+    "max_output_tokens": "maxOutputTokens",
+    "top_p": "topP",
+    "top_k": "topK",
+    "response_mime_type": "responseMimeType",
+    "response_schema": "responseSchema",
+}
+
+
+def merge_openai_chat_model_params(payload: dict[str, Any], model_params: dict[str, Any]) -> None:
+    """Merge model_params into an OpenAI Chat Completions-style payload.
+
+    Translates ``json_schema`` to ``response_format``, passes known OpenAI
+    fields through, and drops Claude/llm-connect-only knobs.
+    """
+
+    before = json_safe(payload) if diagnostics_enabled() else None
+
+    schema = _coerce_json_schema(model_params.get("json_schema"))
+    caller_response_format = model_params.get("response_format")
+    if schema is not None and caller_response_format is None and "response_format" not in payload:
+        payload["response_format"] = {
+            "type": "json_schema",
+            "json_schema": {
+                "name": "structured_output",
+                "schema": schema,
+                "strict": False,
+            },
+        }
+
+    for key, value in model_params.items():
+        if key in DROPPED_NON_OPENAI_FIELDS:
+            continue
+        if key in OPENAI_CHAT_PASSTHROUGH_FIELDS:
+            payload[key] = value
+
+    if before is not None:
+        record_adapter_transformation("merge_model_params.openai_chat", before, payload)
+
+
+def merge_gemini_model_params(payload: dict[str, Any], model_params: dict[str, Any]) -> None:
+    """Merge model_params into a Gemini ``generateContent`` payload."""
+
+    before = json_safe(payload) if diagnostics_enabled() else None
+    generation_config = payload.setdefault("generationConfig", {})
+
+    schema = _coerce_json_schema(model_params.get("json_schema"))
+    if schema is not None and "responseSchema" not in generation_config:
+        generation_config["responseMimeType"] = "application/json"
+        generation_config["responseSchema"] = schema
+
+    explicit_generation_config = model_params.get("generationConfig")
+    if isinstance(explicit_generation_config, dict):
+        generation_config.update(explicit_generation_config)
+
+    for key, value in model_params.items():
+        if key in {"json_schema", "generationConfig", "reasoning_effort", "max_depth"}:
+            continue
+        if key in GEMINI_TOP_LEVEL_FIELDS:
+            payload[key] = value
+            continue
+        gemini_key = GEMINI_GENERATION_CONFIG_ALIASES.get(key, key)
+        if gemini_key in GEMINI_GENERATION_CONFIG_FIELDS:
+            generation_config[gemini_key] = value
+
+    if before is not None:
+        record_adapter_transformation("merge_model_params.gemini", before, payload)
+
+
+def _coerce_json_schema(schema: Any) -> dict[str, Any] | None:
+    if isinstance(schema, str):
+        try:
+            schema = json.loads(schema)
+        except (TypeError, ValueError):
+            return None
+    if isinstance(schema, dict):
+        return schema
+    return None
diff --git a/llm_connect/claude_code.py b/llm_connect/claude_code.py
index 6a9c08f..95e383c 100644
--- a/llm_connect/claude_code.py
+++ b/llm_connect/claude_code.py
@@ -1,277 +1,289 @@
-"""
-Claude Code CLI adapter — runs the ``claude`` CLI as a subprocess.
-"""
-
-import asyncio
-import json
-import os
-import subprocess
-from pathlib import Path
-from typing import Optional
-
-from llm_connect.adapter import LLMAdapter
-from llm_connect.models import RunConfig, LLMResponse
-from llm_connect.config import LLMConfig
-from llm_connect._token_estimator import estimate_tokens
-from llm_connect.exceptions import (
-    LLMSubprocessError,
-    LLMTimeoutError,
-)
-
-
-class ClaudeCodeAdapter(LLMAdapter):
-    """LLM adapter that shells out to the ``claude`` CLI with ``--print``.
-
-    The compiled prompt is piped via **stdin** to avoid shell argument
-    length limits (compiled prompts can exceed 30 KB).
-    """
-
-    def __init__(
-        self,
-        cli_path: Optional[str] = None,
-        model: Optional[str] = None,
-        config: Optional[LLMConfig] = None,
-    ):
-        self._config = config or LLMConfig(provider="claude-code")
-        self._cli_path = cli_path or self._resolve_cli_path()
-        self._model = model
-
-    # ── LLMAdapter interface ────────────────────────────────────────
-
-    def execute_prompt(self, prompt: str, config: RunConfig) -> LLMResponse:
-        self._preflight_budget(config)
-        cmd = self._build_command(config)
-
-        timeout = config.timeout_seconds or self._config.timeout_seconds
-
-        try:
-            result = subprocess.run(
-                cmd,
-                input=prompt,
-                capture_output=True,
-                text=True,
-                timeout=timeout,
-            )
-        except subprocess.TimeoutExpired as exc:
-            raise LLMTimeoutError(
-                f"claude CLI timed out after {timeout}s",
-                cause=exc,
-            ) from exc
-
-        if result.returncode != 0:
-            raise LLMSubprocessError(
-                f"claude CLI exited with code {result.returncode}",
-                return_code=result.returncode,
-                stderr=result.stderr,
-            )
-
-        content = _unwrap_cli_json_envelope(result.stdout, config)
-        prompt_tokens = estimate_tokens(prompt)
-        completion_tokens = estimate_tokens(content)
-
-        response = LLMResponse(
-            content=content,
-            model=self._model or "claude-code-cli",
-            usage={
-                "prompt_tokens": prompt_tokens,
-                "completion_tokens": completion_tokens,
-                "total_tokens": prompt_tokens + completion_tokens,
-            },
-            finish_reason="stop",
-            metadata={
-                "provider": "claude-code",
-                "cli_path": self._cli_path,
-            },
-        )
-        self._consume_budget(config, response)
-        return response
-
-    async def async_execute_prompt(self, prompt: str, config: RunConfig) -> LLMResponse:
-        """Native async implementation using asyncio.create_subprocess_exec."""
-        self._preflight_budget(config)
-        cmd = self._build_command(config)
-
-        timeout = config.timeout_seconds or self._config.timeout_seconds
-
-        try:
-            proc = await asyncio.create_subprocess_exec(
-                *cmd,
-                stdin=asyncio.subprocess.PIPE,
-                stdout=asyncio.subprocess.PIPE,
-                stderr=asyncio.subprocess.PIPE,
-            )
-            stdout_bytes, stderr_bytes = await asyncio.wait_for(
-                proc.communicate(input=prompt.encode()),
-                timeout=timeout,
-            )
-        except asyncio.TimeoutError as exc:
-            raise LLMTimeoutError(
-                f"claude CLI timed out after {timeout}s",
-                cause=exc,
-            ) from exc
-
-        if proc.returncode != 0:
-            raise LLMSubprocessError(
-                f"claude CLI exited with code {proc.returncode}",
-                return_code=proc.returncode,
-                stderr=stderr_bytes.decode(),
-            )
-
-        content = _unwrap_cli_json_envelope(stdout_bytes.decode(), config)
-        prompt_tokens = estimate_tokens(prompt)
-        completion_tokens = estimate_tokens(content)
-
-        response = LLMResponse(
-            content=content,
-            model=self._model or "claude-code-cli",
-            usage={
-                "prompt_tokens": prompt_tokens,
-                "completion_tokens": completion_tokens,
-                "total_tokens": prompt_tokens + completion_tokens,
-            },
-            finish_reason="stop",
-            metadata={
-                "provider": "claude-code",
-                "cli_path": self._cli_path,
-                "async": True,
-            },
-        )
-        self._consume_budget(config, response)
-        return response
-
-    def validate_config(self, config: RunConfig) -> bool:
-        try:
-            result = subprocess.run(
-                [self._cli_path, "--version"],
-                capture_output=True,
-                text=True,
-                timeout=10,
-            )
-            return result.returncode == 0
-        except (subprocess.TimeoutExpired, FileNotFoundError, OSError):
-            return False
-
-    def _build_command(self, config: RunConfig) -> list[str]:
-        cmd = [self._cli_path, "--print"]
-        if self._model:
-            cmd.extend(["--model", self._model])
-
-        json_schema = _json_schema_arg(config)
-        if json_schema:
-            cmd.extend(["--json-schema", json_schema])
-            # With --json-schema alone the CLI prints conversational text on
-            # stdout while the structured payload ships on a sidecar channel
-            # callers cannot reach. --output-format json forces the structured
-            # response (wrapped in an envelope) onto stdout.
-            cmd.extend(["--output-format", "json"])
-        return cmd
-
-    def _resolve_cli_path(self) -> str:
-        configured = (
-            os.environ.get("LLM_CONNECT_CLAUDE_CLI_PATH")
-            or os.environ.get("CLAUDE_CLI_PATH")
-            or self._config.claude_cli_path
-        )
-        if configured and configured != "claude":
-            return configured
-
-        local_cli = Path.home() / ".local" / "bin" / "claude"
-        if local_cli.exists():
-            return str(local_cli)
-        return configured or "claude"
-
-
-def _json_schema_arg(config: RunConfig) -> str | None:
-    schema = (config.model_params or {}).get("json_schema")
-    if not schema:
-        return None
-    if isinstance(schema, str):
-        return schema
-    if isinstance(schema, dict):
-        return json.dumps(schema, separators=(",", ":"))
-    return None
-
-
-# Envelope field names Claude Code's `--output-format json` is known to use
-# for the model's primary textual response. Used as a fall-back when no field
-# carries a JSON-parseable payload (e.g. plain prose generation).
-_ENVELOPE_TEXT_FIELDS = ("result", "result_text", "content", "text", "output")
-
-
-def _unwrap_cli_json_envelope(stdout: str, config: RunConfig) -> str:
-    """Extract the model's payload from Claude CLI's --output-format json envelope.
-
-    Only runs when --json-schema was set (the only code path that adds
-    --output-format json to the CLI invocation). Other callers keep the raw
-    stdout behavior unchanged.
-
-    Strategy: when --json-schema is set the caller wants JSON back, so prefer
-    any envelope field whose value is itself valid JSON (dict, list, or a
-    string that parses as JSON). This handles two observed envelope shapes:
-
-    1. Short prompts where the model emits the structured payload directly
-       in the `result` field as a JSON-encoded string.
-    2. Longer prompts where the model emits a conversational preamble in
-       `result` and the schema-enforced JSON in a separate field (the exact
-       field name varies across CLI versions).
-
-    Fall back to the first text field only when no JSON-bearing field exists,
-    so non-schema callers via this code path still see the model's prose.
-    Surface the raw envelope as a last resort so the operator can see what
-    shape arrived and extend the strategy.
-    """
-    if not _json_schema_arg(config):
-        return stdout
-    text = stdout.strip()
-    if not text:
-        return stdout
-    try:
-        envelope = json.loads(text)
-    except json.JSONDecodeError:
-        return stdout
-    if not isinstance(envelope, dict):
-        return stdout
-
-    json_payload = _find_json_payload(envelope)
-    if json_payload is not None:
-        return json_payload
-
-    for key in _ENVELOPE_TEXT_FIELDS:
-        value = envelope.get(key)
-        if isinstance(value, str):
-            return value
-        if isinstance(value, (dict, list)):
-            return json.dumps(value)
-
-    return stdout
-
-
-def _find_json_payload(envelope: dict) -> str | None:
-    """Return the first envelope value that represents valid JSON.
-
-    Insertion order is preserved by Python dicts, so this prefers fields the
-    CLI lists earliest in its envelope. Skips obvious metadata keys (cost,
-    usage, timing) so we never accidentally pick a numeric or telemetry value.
-    """
-    for key, value in envelope.items():
-        if key in _ENVELOPE_METADATA_KEYS:
-            continue
-        if isinstance(value, (dict, list)):
-            return json.dumps(value)
-        if isinstance(value, str):
-            stripped = value.strip()
-            if stripped.startswith(("{", "[")):
-                try:
-                    json.loads(stripped)
-                except json.JSONDecodeError:
-                    continue
-                return stripped
-    return None
-
-
-# Envelope keys that carry telemetry, never the model payload.
-_ENVELOPE_METADATA_KEYS = frozenset({
-    "type", "subtype", "model", "usage", "total_cost_usd", "cost_usd",
-    "duration_ms", "duration_api_ms", "num_turns", "session_id",
-    "is_error", "stop_reason", "permission_denials", "uuid",
-})
+"""
+Claude Code CLI adapter - runs the ``claude`` CLI as a subprocess.
+"""
+
+import asyncio
+import json
+import os
+import subprocess
+from pathlib import Path
+from typing import Optional
+
+from llm_connect._diagnostics import (
+    record_adapter_transformation,
+    record_provider_request,
+    record_provider_response,
+)
+from llm_connect._token_estimator import estimate_tokens
+from llm_connect.adapter import LLMAdapter
+from llm_connect.config import LLMConfig
+from llm_connect.exceptions import LLMSubprocessError, LLMTimeoutError
+from llm_connect.models import LLMResponse, RunConfig
+
+
+class ClaudeCodeAdapter(LLMAdapter):
+    """LLM adapter that shells out to the ``claude`` CLI with ``--print``.
+
+    The compiled prompt is piped via stdin to avoid shell argument length
+    limits. Compiled prompts can exceed 30 KB.
+    """
+
+    def __init__(
+        self,
+        cli_path: Optional[str] = None,
+        model: Optional[str] = None,
+        config: Optional[LLMConfig] = None,
+    ):
+        self._config = config or LLMConfig(provider="claude-code")
+        self._cli_path = cli_path or self._resolve_cli_path()
+        self._model = model
+
+    # LLMAdapter interface
+
+    def execute_prompt(self, prompt: str, config: RunConfig) -> LLMResponse:
+        self._preflight_budget(config)
+        cmd = self._build_command(config)
+
+        timeout = config.timeout_seconds or self._config.timeout_seconds
+        record_provider_request(command=cmd, payload={"stdin": prompt})
+
+        try:
+            result = subprocess.run(
+                cmd,
+                input=prompt,
+                capture_output=True,
+                text=True,
+                timeout=timeout,
+            )
+        except subprocess.TimeoutExpired as exc:
+            raise LLMTimeoutError(
+                f"claude CLI timed out after {timeout}s",
+                cause=exc,
+            ) from exc
+
+        record_provider_response(
+            status=result.returncode,
+            body={"stdout": result.stdout, "stderr": result.stderr},
+        )
+        if result.returncode != 0:
+            raise LLMSubprocessError(
+                f"claude CLI exited with code {result.returncode}",
+                return_code=result.returncode,
+                stderr=result.stderr,
+            )
+
+        content = _unwrap_cli_json_envelope(result.stdout, config)
+        prompt_tokens = estimate_tokens(prompt)
+        completion_tokens = estimate_tokens(content)
+
+        response = LLMResponse(
+            content=content,
+            model=self._model or "claude-code-cli",
+            usage={
+                "prompt_tokens": prompt_tokens,
+                "completion_tokens": completion_tokens,
+                "total_tokens": prompt_tokens + completion_tokens,
+            },
+            finish_reason="stop",
+            metadata={
+                "provider": "claude-code",
+                "cli_path": self._cli_path,
+            },
+        )
+        self._consume_budget(config, response)
+        return response
+
+    async def async_execute_prompt(self, prompt: str, config: RunConfig) -> LLMResponse:
+        """Native async implementation using asyncio.create_subprocess_exec."""
+        self._preflight_budget(config)
+        cmd = self._build_command(config)
+
+        timeout = config.timeout_seconds or self._config.timeout_seconds
+        record_provider_request(command=cmd, payload={"stdin": prompt})
+
+        try:
+            proc = await asyncio.create_subprocess_exec(
+                *cmd,
+                stdin=asyncio.subprocess.PIPE,
+                stdout=asyncio.subprocess.PIPE,
+                stderr=asyncio.subprocess.PIPE,
+            )
+            stdout_bytes, stderr_bytes = await asyncio.wait_for(
+                proc.communicate(input=prompt.encode()),
+                timeout=timeout,
+            )
+        except asyncio.TimeoutError as exc:
+            raise LLMTimeoutError(
+                f"claude CLI timed out after {timeout}s",
+                cause=exc,
+            ) from exc
+
+        stdout = stdout_bytes.decode()
+        stderr = stderr_bytes.decode()
+        record_provider_response(
+            status=proc.returncode,
+            body={"stdout": stdout, "stderr": stderr},
+        )
+        if proc.returncode != 0:
+            raise LLMSubprocessError(
+                f"claude CLI exited with code {proc.returncode}",
+                return_code=proc.returncode,
+                stderr=stderr,
+            )
+
+        content = _unwrap_cli_json_envelope(stdout, config)
+        prompt_tokens = estimate_tokens(prompt)
+        completion_tokens = estimate_tokens(content)
+
+        response = LLMResponse(
+            content=content,
+            model=self._model or "claude-code-cli",
+            usage={
+                "prompt_tokens": prompt_tokens,
+                "completion_tokens": completion_tokens,
+                "total_tokens": prompt_tokens + completion_tokens,
+            },
+            finish_reason="stop",
+            metadata={
+                "provider": "claude-code",
+                "cli_path": self._cli_path,
+                "async": True,
+            },
+        )
+        self._consume_budget(config, response)
+        return response
+
+    def validate_config(self, config: RunConfig) -> bool:
+        try:
+            result = subprocess.run(
+                [self._cli_path, "--version"],
+                capture_output=True,
+                text=True,
+                timeout=10,
+            )
+            return result.returncode == 0
+        except (subprocess.TimeoutExpired, FileNotFoundError, OSError):
+            return False
+
+    def _build_command(self, config: RunConfig) -> list[str]:
+        cmd = [self._cli_path, "--print"]
+        if self._model:
+            cmd.extend(["--model", self._model])
+
+        json_schema = _json_schema_arg(config)
+        if json_schema:
+            cmd.extend(["--json-schema", json_schema])
+            # With --json-schema alone the CLI prints conversational text on
+            # stdout while the structured payload ships on a sidecar channel
+            # callers cannot reach. --output-format json forces the structured
+            # response (wrapped in an envelope) onto stdout.
+            cmd.extend(["--output-format", "json"])
+        return cmd
+
+    def _resolve_cli_path(self) -> str:
+        configured = (
+            os.environ.get("LLM_CONNECT_CLAUDE_CLI_PATH")
+            or os.environ.get("CLAUDE_CLI_PATH")
+            or self._config.claude_cli_path
+        )
+        if configured and configured != "claude":
+            return configured
+
+        local_cli = Path.home() / ".local" / "bin" / "claude"
+        if local_cli.exists():
+            return str(local_cli)
+        return configured or "claude"
+
+
+def _json_schema_arg(config: RunConfig) -> str | None:
+    schema = (config.model_params or {}).get("json_schema")
+    if not schema:
+        return None
+    if isinstance(schema, str):
+        return schema
+    if isinstance(schema, dict):
+        return json.dumps(schema, separators=(",", ":"))
+    return None
+
+
+# Envelope field names Claude Code's --output-format json is known to use for
+# the model's primary textual response. Used as a fallback when no field carries
+# a JSON-parseable payload, such as plain prose generation.
+_ENVELOPE_TEXT_FIELDS = ("result", "result_text", "content", "text", "output")
+
+
+def _unwrap_cli_json_envelope(stdout: str, config: RunConfig) -> str:
+    """Extract the model's payload from Claude CLI's --output-format json envelope.
+
+    Only runs when --json-schema was set. Other callers keep the raw stdout
+    behavior unchanged.
+    """
+    if not _json_schema_arg(config):
+        return stdout
+    text = stdout.strip()
+    if not text:
+        return stdout
+    try:
+        envelope = json.loads(text)
+    except json.JSONDecodeError:
+        return stdout
+    if not isinstance(envelope, dict):
+        return stdout
+
+    json_payload = _find_json_payload(envelope)
+    if json_payload is not None:
+        return _record_unwrap(stdout, json_payload)
+
+    for key in _ENVELOPE_TEXT_FIELDS:
+        value = envelope.get(key)
+        if isinstance(value, str):
+            return _record_unwrap(stdout, value)
+        if isinstance(value, (dict, list)):
+            return _record_unwrap(stdout, json.dumps(value))
+
+    return stdout
+
+
+def _find_json_payload(envelope: dict) -> str | None:
+    """Return the first envelope value that represents valid JSON."""
+    for key, value in envelope.items():
+        if key in _ENVELOPE_METADATA_KEYS:
+            continue
+        if isinstance(value, (dict, list)):
+            return json.dumps(value)
+        if isinstance(value, str):
+            stripped = value.strip()
+            if stripped.startswith(("{", "[")):
+                try:
+                    json.loads(stripped)
+                except json.JSONDecodeError:
+                    continue
+                return stripped
+    return None
+
+
+# Envelope keys that carry telemetry, never the model payload.
+_ENVELOPE_METADATA_KEYS = frozenset(
+    {
+        "type",
+        "subtype",
+        "model",
+        "usage",
+        "total_cost_usd",
+        "cost_usd",
+        "duration_ms",
+        "duration_api_ms",
+        "num_turns",
+        "session_id",
+        "is_error",
+        "stop_reason",
+        "permission_denials",
+        "uuid",
+    }
+)
+
+
+def _record_unwrap(stdout: str, content: str) -> str:
+    if content != stdout:
+        record_adapter_transformation("unwrap_cli_envelope", stdout, content)
+    return content
diff --git a/llm_connect/gemini.py b/llm_connect/gemini.py
index 7d5db8d..c6e674c 100644
--- a/llm_connect/gemini.py
+++ b/llm_connect/gemini.py
@@ -9,6 +9,7 @@ from llm_connect.adapter import LLMAdapter
 from llm_connect.models import RunConfig, LLMResponse
 from llm_connect.config import resolve_api_key, find_project_root
 from llm_connect._http import post_json
+from llm_connect._payload import merge_gemini_model_params
 from llm_connect.exceptions import LLMConfigurationError
 
 _DEFAULT_MODEL = "gemini-2.5-flash"
@@ -74,6 +75,8 @@ class GeminiAdapter(LLMAdapter):
                 "maxOutputTokens": config.max_tokens,
             },
         }
+        if config.model_params:
+            merge_gemini_model_params(payload, config.model_params)
 
         url = f"{_API_BASE}/models/{model}:generateContent?key={self._api_key}"
 
diff --git a/llm_connect/openai.py b/llm_connect/openai.py
index c0c76d2..14e8e6f 100644
--- a/llm_connect/openai.py
+++ b/llm_connect/openai.py
@@ -9,6 +9,7 @@ from llm_connect.adapter import LLMAdapter
 from llm_connect.models import RunConfig, LLMResponse
 from llm_connect.config import resolve_api_key, find_project_root
 from llm_connect._http import post_json
+from llm_connect._payload import merge_openai_chat_model_params
 from llm_connect.exceptions import (
     LLMConfigurationError,
     LLMAPIError,
@@ -65,6 +66,8 @@ class OpenAIAdapter(LLMAdapter):
             "temperature": config.temperature,
             "max_tokens": config.max_tokens,
         }
+        if config.model_params:
+            merge_openai_chat_model_params(payload, config.model_params)
 
         headers = {
             "Authorization": f"Bearer {self._api_key}",
diff --git a/llm_connect/openrouter.py b/llm_connect/openrouter.py
index 623c5a8..c4027da 100644
--- a/llm_connect/openrouter.py
+++ b/llm_connect/openrouter.py
@@ -1,221 +1,151 @@
-"""
-OpenRouter adapter — calls the OpenAI-compatible chat completions API.
-"""
-
-import time
-from typing import Optional, Dict, Any
-
-from llm_connect.adapter import LLMAdapter
-from llm_connect.models import RunConfig, LLMResponse
-from llm_connect.config import LLMConfig, resolve_api_key, find_project_root
-from llm_connect._http import post_json
-from llm_connect.exceptions import (
-    LLMConfigurationError,
-    LLMAPIError,
-    LLMRateLimitError,
-)
-
-_DEFAULT_MODEL = "anthropic/claude-sonnet-4"
-
-
-class OpenRouterAdapter(LLMAdapter):
-    """LLM adapter that calls the OpenRouter chat completions endpoint.
-
-    Constructor args override values from *config*; *config* overrides
-    global defaults.  The model used for a given call is resolved as:
-    ``constructor model > RunConfig.model_name > default``.
-    """
-
-    def __init__(
-        self,
-        model: Optional[str] = None,
-        api_key: Optional[str] = None,
-        api_base: Optional[str] = None,
-        config: Optional[LLMConfig] = None,
-        system_prompt: Optional[str] = None,
-        extra_headers: Optional[Dict[str, str]] = None,
-        max_retries: Optional[int] = None,
-    ):
-        self._config = config or LLMConfig()
-        # Track whether the model was explicitly supplied (constructor or
-        # LLMConfig). Comparing self._model to _DEFAULT_MODEL is not enough —
-        # callers who pass --model anthropic/claude-sonnet-4 happen to match
-        # the default and would otherwise be misrouted to RunConfig.model_name
-        # (which defaults to "gpt-4" — quietly sending every call to OpenAI's
-        # gpt-4 model, which is what broke the activity-core CUST-WP-0045
-        # canary on 2026-06-02).
-        self._explicit_model = model is not None or self._config.model is not None
-        self._model = model or self._config.model or _DEFAULT_MODEL
-        self._api_base = (api_base or self._config.api_base).rstrip("/")
-        self._system_prompt = system_prompt
-        self._extra_headers = extra_headers or {}
-        self._max_retries = max_retries if max_retries is not None else self._config.max_retries
-
-        # Resolve API key
-        root = find_project_root()
-        key_file_paths = [root / "apikey-openrouter.txt"] if root else []
-        self._api_key = resolve_api_key(
-            explicit=api_key or self._config.api_key,
-            env_var="OPENROUTER_API_KEY",
-            key_file_paths=key_file_paths,
-        )
-
-    # ── LLMAdapter interface ────────────────────────────────────────
-
-    def execute_prompt(self, prompt: str, config: RunConfig) -> LLMResponse:
-        self._preflight_budget(config)
-        # Explicit constructor/LLMConfig model wins; only fall back to the
-        # per-call RunConfig.model_name when the adapter wasn't told what to
-        # use. RunConfig.model_name defaults to "gpt-4", so falling back
-        # unconditionally would silently misroute callers.
-        if self._explicit_model:
-            model = self._model
-        else:
-            model = config.model_name or self._model
-
-        messages: list[Dict[str, str]] = []
-        if self._system_prompt:
-            messages.append({"role": "system", "content": self._system_prompt})
-        messages.append({"role": "user", "content": prompt})
-
-        payload: Dict[str, Any] = {
-            "model": model,
-            "messages": messages,
-            "temperature": config.temperature,
-            "max_tokens": config.max_tokens,
-        }
-        if config.model_params:
-            _merge_model_params(payload, config.model_params)
-
-        headers = {
-            "Authorization": f"Bearer {self._api_key}",
-            **self._extra_headers,
-        }
-        url = f"{self._api_base}/chat/completions"
-
-        start = time.time()
-        data = self._post_with_retries(url, payload, headers, config.timeout_seconds)
-        latency = time.time() - start
-
-        # Parse response
-        choice = data.get("choices", [{}])[0]
-        content = choice.get("message", {}).get("content", "")
-        finish_reason = choice.get("finish_reason", "stop")
-        usage = data.get("usage", {})
-
-        response = LLMResponse(
-            content=content,
-            model=data.get("model", model),
-            usage={
-                "prompt_tokens": usage.get("prompt_tokens", 0),
-                "completion_tokens": usage.get("completion_tokens", 0),
-                "total_tokens": usage.get("total_tokens", 0),
-            },
-            finish_reason=finish_reason,
-            metadata={
-                "provider": "openrouter",
-                "latency_seconds": round(latency, 3),
-                "response_id": data.get("id", ""),
-            },
-        )
-        self._consume_budget(config, response)
-        return response
-
-    def validate_config(self, config: RunConfig) -> bool:
-        if not self._api_key:
-            return False
-        if not (self._model or config.model_name):
-            return False
-        if not (0.0 <= config.temperature <= 2.0):
-            return False
-        return True
-
-    # ── Internals ───────────────────────────────────────────────────
-
-    def _post_with_retries(
-        self,
-        url: str,
-        payload: Dict[str, Any],
-        headers: Dict[str, str],
-        timeout: int,
-    ) -> Dict[str, Any]:
-        last_exc: Optional[Exception] = None
-        for attempt in range(self._max_retries + 1):
-            try:
-                return post_json(url, payload, headers, timeout=timeout)
-            except LLMRateLimitError as exc:
-                last_exc = exc
-                if attempt < self._max_retries:
-                    time.sleep(2 ** attempt)
-            except LLMAPIError as exc:
-                if exc.status_code >= 500 and attempt < self._max_retries:
-                    last_exc = exc
-                    time.sleep(2 ** attempt)
-                else:
-                    raise
-        raise last_exc  # type: ignore[misc]
-
-
-# OpenAI Chat Completions fields that map straight through from model_params.
-# Anything not in this set is provider-specific and must be either translated
-# or dropped — we never blind-merge into the payload, because OpenRouter
-# rejects unknown top-level fields with HTTP 400.
-_OPENAI_PASSTHROUGH_FIELDS = frozenset({
-    "top_p", "n", "stream", "stop", "presence_penalty",
-    "frequency_penalty", "logit_bias", "user", "seed",
-    "tools", "tool_choice", "response_format",
-    "logprobs", "top_logprobs", "parallel_tool_calls",
-})
-
-# Provider-specific model_params keys that have no OpenAI Chat Completions
-# equivalent and must be silently dropped to keep payloads valid.
-_DROPPED_NON_OPENAI_FIELDS = frozenset({
-    "reasoning_effort",  # Claude CLI / Anthropic-specific
-    "max_depth",         # llm-connect's own depth knob
-    "claude_cli_path",   # adapter wiring leak
-    "json_schema",       # translated below into response_format
-})
-
-
-def _merge_model_params(payload: Dict[str, Any], model_params: Dict[str, Any]) -> None:
-    """Merge RunConfig.model_params into an OpenAI Chat Completions payload.
-
-    Pass-through whitelisted OpenAI keys, translate json_schema into the
-    proper response_format wrapper, drop known provider-specific fields,
-    and ignore anything else rather than letting it through and triggering
-    a 400 from OpenRouter (the failure mode that hit CUST-WP-0045 on
-    2026-06-02 — reasoning_effort and a top-level json_schema were merged
-    into the body and the API rejected both).
-    """
-    schema = model_params.get("json_schema")
-    if schema is not None and "response_format" not in payload:
-        if isinstance(schema, str):
-            try:
-                import json as _json
-                schema = _json.loads(schema)
-            except (ValueError, TypeError):
-                schema = None
-        if isinstance(schema, dict):
-            # strict=False: OpenAI's strict mode requires additionalProperties
-            # to be false on every object and every property in the required
-            # list. Most application-supplied schemas are not written that
-            # way (the activity-core daily-triage schema, for example, has
-            # neither). With strict=False, OpenRouter still honours the
-            # schema as a soft constraint and the model's output remains
-            # structured. Callers can opt back into strict by including
-            # `strict: true` themselves in a custom `response_format`.
-            payload["response_format"] = {
-                "type": "json_schema",
-                "json_schema": {
-                    "name": "structured_output",
-                    "schema": schema,
-                    "strict": False,
-                },
-            }
-
-    for key, value in model_params.items():
-        if key in _DROPPED_NON_OPENAI_FIELDS:
-            continue
-        if key in _OPENAI_PASSTHROUGH_FIELDS:
-            payload[key] = value
-        # else: silently drop unknown keys rather than risk a 400.
+"""
+OpenRouter adapter - calls the OpenAI-compatible chat completions API.
+"""
+
+import time
+from typing import Any, Dict, Optional
+
+from llm_connect._http import post_json
+from llm_connect._payload import merge_openai_chat_model_params
+from llm_connect.adapter import LLMAdapter
+from llm_connect.config import LLMConfig, find_project_root, resolve_api_key
+from llm_connect.exceptions import LLMAPIError, LLMRateLimitError
+from llm_connect.models import LLMResponse, RunConfig
+
+_DEFAULT_MODEL = "anthropic/claude-sonnet-4"
+
+
+class OpenRouterAdapter(LLMAdapter):
+    """LLM adapter that calls the OpenRouter chat completions endpoint.
+
+    Constructor args override values from *config*; *config* overrides
+    global defaults. The model used for a given call is resolved as:
+    ``constructor model > RunConfig.model_name > default``.
+    """
+
+    def __init__(
+        self,
+        model: Optional[str] = None,
+        api_key: Optional[str] = None,
+        api_base: Optional[str] = None,
+        config: Optional[LLMConfig] = None,
+        system_prompt: Optional[str] = None,
+        extra_headers: Optional[Dict[str, str]] = None,
+        max_retries: Optional[int] = None,
+    ):
+        self._config = config or LLMConfig()
+        # Track whether the model was explicitly supplied (constructor or
+        # LLMConfig). Comparing self._model to _DEFAULT_MODEL is not enough:
+        # callers who pass --model anthropic/claude-sonnet-4 happen to match
+        # the default and would otherwise be misrouted to RunConfig.model_name
+        # (which defaults to "gpt-4", quietly sending every call to OpenAI's
+        # gpt-4 model, which is what broke the activity-core CUST-WP-0045
+        # canary on 2026-06-02).
+        self._explicit_model = model is not None or self._config.model is not None
+        self._model = model or self._config.model or _DEFAULT_MODEL
+        self._api_base = (api_base or self._config.api_base).rstrip("/")
+        self._system_prompt = system_prompt
+        self._extra_headers = extra_headers or {}
+        self._max_retries = max_retries if max_retries is not None else self._config.max_retries
+
+        root = find_project_root()
+        key_file_paths = [root / "apikey-openrouter.txt"] if root else []
+        self._api_key = resolve_api_key(
+            explicit=api_key or self._config.api_key,
+            env_var="OPENROUTER_API_KEY",
+            key_file_paths=key_file_paths,
+        )
+
+    # LLMAdapter interface
+
+    def execute_prompt(self, prompt: str, config: RunConfig) -> LLMResponse:
+        self._preflight_budget(config)
+        # Explicit constructor/LLMConfig model wins; only fall back to the
+        # per-call RunConfig.model_name when the adapter was not told what to
+        # use. RunConfig.model_name defaults to "gpt-4", so falling back
+        # unconditionally would silently misroute callers.
+        if self._explicit_model:
+            model = self._model
+        else:
+            model = config.model_name or self._model
+
+        messages: list[Dict[str, str]] = []
+        if self._system_prompt:
+            messages.append({"role": "system", "content": self._system_prompt})
+        messages.append({"role": "user", "content": prompt})
+
+        payload: Dict[str, Any] = {
+            "model": model,
+            "messages": messages,
+            "temperature": config.temperature,
+            "max_tokens": config.max_tokens,
+        }
+        if config.model_params:
+            merge_openai_chat_model_params(payload, config.model_params)
+
+        headers = {
+            "Authorization": f"Bearer {self._api_key}",
+            **self._extra_headers,
+        }
+        url = f"{self._api_base}/chat/completions"
+
+        start = time.time()
+        data = self._post_with_retries(url, payload, headers, config.timeout_seconds)
+        latency = time.time() - start
+
+        choice = data.get("choices", [{}])[0]
+        content = choice.get("message", {}).get("content", "")
+        finish_reason = choice.get("finish_reason", "stop")
+        usage = data.get("usage", {})
+
+        response = LLMResponse(
+            content=content,
+            model=data.get("model", model),
+            usage={
+                "prompt_tokens": usage.get("prompt_tokens", 0),
+                "completion_tokens": usage.get("completion_tokens", 0),
+                "total_tokens": usage.get("total_tokens", 0),
+            },
+            finish_reason=finish_reason,
+            metadata={
+                "provider": "openrouter",
+                "latency_seconds": round(latency, 3),
+                "response_id": data.get("id", ""),
+            },
+        )
+        self._consume_budget(config, response)
+        return response
+
+    def validate_config(self, config: RunConfig) -> bool:
+        if not self._api_key:
+            return False
+        if not (self._model or config.model_name):
+            return False
+        if not (0.0 <= config.temperature <= 2.0):
+            return False
+        return True
+
+    # Internals
+
+    def _post_with_retries(
+        self,
+        url: str,
+        payload: Dict[str, Any],
+        headers: Dict[str, str],
+        timeout: int,
+    ) -> Dict[str, Any]:
+        last_exc: Optional[Exception] = None
+        for attempt in range(self._max_retries + 1):
+            try:
+                return post_json(url, payload, headers, timeout=timeout)
+            except LLMRateLimitError as exc:
+                last_exc = exc
+                if attempt < self._max_retries:
+                    time.sleep(2 ** attempt)
+            except LLMAPIError as exc:
+                if exc.status_code >= 500 and attempt < self._max_retries:
+                    last_exc = exc
+                    time.sleep(2 ** attempt)
+                else:
+                    raise
+        raise last_exc  # type: ignore[misc]
diff --git a/llm_connect/replay.py b/llm_connect/replay.py
new file mode 100644
index 0000000..082c122
--- /dev/null
+++ b/llm_connect/replay.py
@@ -0,0 +1,121 @@
+"""Replay llm-connect audit records without making provider calls."""
+
+from __future__ import annotations
+
+import argparse
+import json
+from pathlib import Path
+from typing import Any
+
+from llm_connect.claude_code import _unwrap_cli_json_envelope
+from llm_connect.models import RunConfig
+
+
+def parse_audit_record(record: dict[str, Any]) -> dict[str, Any]:
+    """Parse the recorded provider response and compare it to saved content."""
+
+    config = RunConfig.from_dict(record.get("config", {}))
+    provider = record.get("provider") or _infer_provider(record)
+    provider_response = record.get("provider_response") or {}
+    body = provider_response.get("body")
+    parsed_content = _parse_provider_response(provider, body, config)
+    recorded_content = record.get("parsed_content")
+    schema_check = _check_structured_output(parsed_content, config.model_params.get("json_schema"))
+
+    return {
+        "provider": provider,
+        "parsed_content": parsed_content,
+        "matches_recorded_content": parsed_content == recorded_content,
+        "structured_output": schema_check,
+    }
+
+
+def main(argv: list[str] | None = None) -> None:
+    parser = argparse.ArgumentParser(
+        prog="python -m llm_connect.replay",
+        description="Replay parsing for a llm-connect audit JSON file.",
+    )
+    parser.add_argument("audit_file", help="Path to an audit JSON file")
+    parser.add_argument("--json", action="store_true", help="Print the full replay report")
+    args = parser.parse_args(argv)
+
+    record = json.loads(Path(args.audit_file).read_text(encoding="utf-8"))
+    report = parse_audit_record(record)
+    if args.json:
+        print(json.dumps(report, indent=2, sort_keys=True))
+    else:
+        print(report["parsed_content"])
+
+
+def _parse_provider_response(provider: str | None, body: Any, config: RunConfig) -> str:
+    if provider in {"openai", "openrouter"}:
+        if isinstance(body, dict):
+            choice = (body.get("choices") or [{}])[0]
+            return choice.get("message", {}).get("content", "")
+        return ""
+
+    if provider == "gemini":
+        if isinstance(body, dict):
+            candidates = body.get("candidates") or []
+            if not candidates:
+                return ""
+            parts = candidates[0].get("content", {}).get("parts", [])
+            return "".join(part.get("text", "") for part in parts)
+        return ""
+
+    if provider == "claude-code":
+        if isinstance(body, dict):
+            return _unwrap_cli_json_envelope(body.get("stdout", ""), config)
+        return ""
+
+    if isinstance(body, str):
+        return body
+    if body is None:
+        return ""
+    return json.dumps(body)
+
+
+def _infer_provider(record: dict[str, Any]) -> str | None:
+    request = record.get("provider_request") or {}
+    url = request.get("url", "")
+    if "openrouter.ai" in url:
+        return "openrouter"
+    if "api.openai.com" in url:
+        return "openai"
+    if "generativelanguage.googleapis.com" in url:
+        return "gemini"
+    if request.get("command"):
+        return "claude-code"
+    return None
+
+
+def _check_structured_output(content: str, schema: Any) -> dict[str, Any]:
+    if not schema:
+        return {"checked": False}
+    if isinstance(schema, str):
+        try:
+            schema = json.loads(schema)
+        except ValueError as exc:
+            return {"checked": True, "valid": False, "error": f"invalid schema JSON: {exc}"}
+    if not isinstance(schema, dict):
+        return {"checked": True, "valid": False, "error": "schema must be an object"}
+
+    try:
+        parsed = json.loads(content)
+    except ValueError as exc:
+        return {"checked": True, "valid": False, "error": f"invalid output JSON: {exc}"}
+
+    missing = []
+    if schema.get("type") == "object":
+        if not isinstance(parsed, dict):
+            return {"checked": True, "valid": False, "error": "output is not an object"}
+        for key in schema.get("required", []):
+            if key not in parsed:
+                missing.append(key)
+    if missing:
+        return {"checked": True, "valid": False, "missing_required": missing}
+    return {"checked": True, "valid": True}
+
+
+if __name__ == "__main__":
+    main()
diff --git a/llm_connect/server.py b/llm_connect/server.py
index d7f1093..93f52a4 100644
--- a/llm_connect/server.py
+++ b/llm_connect/server.py
@@ -21,13 +21,21 @@ Usage (CLI)::
 """
 
 import argparse
+import datetime as _dt
 import json
+import os
+import re
 import threading
-from http.server import BaseHTTPRequestHandler, HTTPServer
+import time
+import uuid
+from http.server import BaseHTTPRequestHandler, ThreadingHTTPServer
+from pathlib import Path
 from typing import Optional
+from urllib.parse import parse_qs, urlsplit
 
+from llm_connect._diagnostics import capture_diagnostics
 from llm_connect.adapter import LLMAdapter
-from llm_connect.models import RunConfig
+from llm_connect.models import LLMResponse, RunConfig
 
 
 class _Handler(BaseHTTPRequestHandler):
@@ -39,7 +47,8 @@ class _Handler(BaseHTTPRequestHandler):
     # ── GET ────────────────────────────────────────────────────────
 
     def do_GET(self):
-        if self.path == "/health":
+        parsed = urlsplit(self.path)
+        if parsed.path == "/health":
             self._respond(200, {"status": "ok"})
         else:
             self._respond(404, {"error": "not found"})
@@ -47,10 +56,13 @@ class _Handler(BaseHTTPRequestHandler):
     # ── POST ───────────────────────────────────────────────────────
 
     def do_POST(self):
-        if self.path != "/execute":
+        parsed = urlsplit(self.path)
+        if parsed.path != "/execute":
             self._respond(404, {"error": "not found"})
             return
 
+        debug_enabled = _debug_requested(parsed.query)
+        audit_dir = os.environ.get("LLM_CONNECT_AUDIT_DIR")
         length = int(self.headers.get("Content-Length", 0))
         raw = self.rfile.read(length)
         try:
@@ -70,9 +82,19 @@ class _Handler(BaseHTTPRequestHandler):
             return
         config = RunConfig.from_dict(cfg)
 
+        start = time.time()
+        diagnostics_enabled = debug_enabled or bool(audit_dir)
         try:
-            response = self.server.adapter.execute_prompt(prompt, config)  # type: ignore[attr-defined]
-            self._respond(200, response.to_dict())
+            with capture_diagnostics(diagnostics_enabled) as diagnostics:
+                response = self.server.adapter.execute_prompt(prompt, config)  # type: ignore[attr-defined]
+            latency = time.time() - start
+            body = response.to_dict()
+            debug = diagnostics.to_dict() if diagnostics is not None else None
+            if debug_enabled and debug is not None:
+                body["debug"] = debug
+            if audit_dir:
+                _write_audit_record(audit_dir, prompt, config, response, debug, latency)
+            self._respond(200, body)
         except Exception as exc:
             self._respond(500, {"error": str(exc)})
 
@@ -102,7 +124,7 @@ class LLMServer:
         host: str = "127.0.0.1",
         port: int = 8080,
     ) -> None:
-        self._httpd = HTTPServer((host, port), _Handler)
+        self._httpd = ThreadingHTTPServer((host, port), _Handler)
         self._httpd.adapter = adapter  # type: ignore[attr-defined]
         self._thread: Optional[threading.Thread] = None
 
@@ -138,6 +160,55 @@ def _build_adapter(provider: str, model: Optional[str]) -> LLMAdapter:
     return create_adapter(provider, model=model)
 
 
+def _debug_requested(query: str) -> bool:
+    env = os.environ.get("LLM_CONNECT_DEBUG", "")
+    if _truthy(env):
+        return True
+    values = parse_qs(query).get("debug", [])
+    return any(_truthy(value) for value in values)
+
+
+def _truthy(value: str) -> bool:
+    return value.strip().lower() in {"1", "true", "yes", "on"}
+
+
+def _write_audit_record(
+    audit_dir: str,
+    prompt: str,
+    config: RunConfig,
+    response: LLMResponse,
+    debug: dict | None,
+    latency_seconds: float,
+) -> None:
+    target_dir = Path(audit_dir)
+    target_dir.mkdir(parents=True, exist_ok=True)
+
+    now = _dt.datetime.now(_dt.timezone.utc)
+    response_id = str(response.metadata.get("response_id") or uuid.uuid4().hex)
+    filename = f"{now.strftime('%Y%m%dT%H%M%S%fZ')}-{_safe_filename(response_id)}.json"
+    diagnostics = debug or {}
+    record = {
+        "timestamp": now.isoformat().replace("+00:00", "Z"),
+        "prompt": prompt,
+        "config": config.to_dict(),
+        "provider": response.metadata.get("provider"),
+        "provider_request": diagnostics.get("provider_request"),
+        "provider_response": diagnostics.get("provider_response"),
+        "adapter_transformations": diagnostics.get("adapter_transformations", []),
+        "parsed_content": response.content,
+        "latency_seconds": round(latency_seconds, 3),
+        "response": response.to_dict(),
+    }
+    (target_dir / filename).write_text(
+        json.dumps(record, indent=2, sort_keys=True),
+        encoding="utf-8",
+    )
+
+
+def _safe_filename(value: str) -> str:
+    return re.sub(r"[^A-Za-z0-9_.-]+", "-", value).strip("-") or "response"
+
+
 def main(argv=None) -> None:
     parser = argparse.ArgumentParser(
         prog="python -m llm_connect.server",
diff --git a/tests/test_payload.py b/tests/test_payload.py
new file mode 100644
index 0000000..4ecc934
--- /dev/null
+++ b/tests/test_payload.py
@@ -0,0 +1,81 @@
+from llm_connect._payload import merge_gemini_model_params, merge_openai_chat_model_params
+
+
+STRUCTURED_SCHEMA = {
+    "type": "object",
+    "properties": {
+        "summary": {"type": "string"},
+        "recommendations": {"type": "array", "items": {"type": "string"}},
+    },
+    "required": ["summary", "recommendations"],
+}
+
+
+ACTIVITY_CORE_MODEL_PARAMS = {
+    "reasoning_effort": "medium",
+    "max_depth": 4,
+    "json_schema": STRUCTURED_SCHEMA,
+    "top_p": 0.8,
+}
+
+
+def test_openai_chat_model_params_translate_activity_core_shape():
+    payload = {
+        "model": "gpt-4.1-mini",
+        "messages": [{"role": "user", "content": "triage"}],
+        "temperature": 0.2,
+        "max_tokens": 200,
+    }
+
+    merge_openai_chat_model_params(payload, ACTIVITY_CORE_MODEL_PARAMS)
+
+    assert payload["response_format"] == {
+        "type": "json_schema",
+        "json_schema": {
+            "name": "structured_output",
+            "schema": STRUCTURED_SCHEMA,
+            "strict": False,
+        },
+    }
+    assert payload["top_p"] == 0.8
+    assert "reasoning_effort" not in payload
+    assert "max_depth" not in payload
+    assert "json_schema" not in payload
+
+
+def test_openai_chat_model_params_preserve_explicit_response_format():
+    explicit = {
+        "type": "json_schema",
+        "json_schema": {
+            "name": "custom",
+            "schema": STRUCTURED_SCHEMA,
+            "strict": True,
+        },
+    }
+    payload = {"model": "gpt-4.1-mini", "messages": []}
+
+    merge_openai_chat_model_params(
+        payload,
+        {"json_schema": STRUCTURED_SCHEMA, "response_format": explicit},
+    )
+
+    assert payload["response_format"] == explicit
+
+
+def test_gemini_model_params_translate_activity_core_shape():
+    payload = {
+        "contents": [{"role": "user", "parts": [{"text": "triage"}]}],
+        "generationConfig": {
+            "temperature": 0.2,
+            "maxOutputTokens": 200,
+        },
+    }
+
+    merge_gemini_model_params(payload, ACTIVITY_CORE_MODEL_PARAMS)
+
+    assert payload["generationConfig"]["responseMimeType"] == "application/json"
+    assert payload["generationConfig"]["responseSchema"] == STRUCTURED_SCHEMA
+    assert payload["generationConfig"]["topP"] == 0.8
+    assert "reasoning_effort" not in payload
+    assert "max_depth" not in payload
+    assert "json_schema" not in payload
diff --git a/tests/test_replay.py b/tests/test_replay.py
new file mode 100644
index 0000000..a907f46
--- /dev/null
+++ b/tests/test_replay.py
@@ -0,0 +1,62 @@
+from llm_connect.replay import parse_audit_record
+
+
+STRUCTURED_SCHEMA = {
+    "type": "object",
+    "properties": {
+        "summary": {"type": "string"},
+        "recommendations": {"type": "array", "items": {"type": "string"}},
+    },
+    "required": ["summary", "recommendations"],
+}
+
+
+def test_replay_parses_openai_style_provider_response():
+    record = {
+        "provider": "openrouter",
+        "config": {"model_params": {"json_schema": STRUCTURED_SCHEMA}},
+        "provider_response": {
+            "status": 200,
+            "body": {
+                "choices": [
+                    {
+                        "message": {
+                            "content": '{"summary":"ok","recommendations":[]}'
+                        }
+                    }
+                ]
+            },
+        },
+        "parsed_content": '{"summary":"ok","recommendations":[]}',
+    }
+
+    report = parse_audit_record(record)
+
+    assert report["parsed_content"] == '{"summary":"ok","recommendations":[]}'
+    assert report["matches_recorded_content"] is True
+    assert report["structured_output"] == {"checked": True, "valid": True}
+
+
+def test_replay_reuses_claude_code_envelope_unwrapper():
+    record = {
+        "provider": "claude-code",
+        "config": {"model_params": {"json_schema": STRUCTURED_SCHEMA}},
+        "provider_response": {
+            "status": 0,
+            "body": {
+                "stdout": (
+                    '{"type":"result","result":"prose",'
+                    '"structured_result":"{\\"summary\\":\\"ok\\",'
+                    '\\"recommendations\\":[]}"}'
+                ),
+                "stderr": "",
+            },
+        },
+        "parsed_content": '{"summary":"ok","recommendations":[]}',
+    }
+
+    report = parse_audit_record(record)
+
+    assert report["parsed_content"] == '{"summary":"ok","recommendations":[]}'
+    assert report["matches_recorded_content"] is True
+    assert report["structured_output"] == {"checked": True, "valid": True}
diff --git a/tests/test_server.py b/tests/test_server.py
index f417739..ac4e1a9 100644
--- a/tests/test_server.py
+++ b/tests/test_server.py
@@ -2,14 +2,22 @@
 Tests for LLMServer HTTP serve mode (FR-1).
 """
 
+import threading
+import time
+from concurrent.futures import ThreadPoolExecutor
 import json
 import urllib.error
 import urllib.request
 
 import pytest
 
+from llm_connect._diagnostics import (
+    record_adapter_transformation,
+    record_provider_request,
+    record_provider_response,
+)
 from llm_connect.adapter import MockLLMAdapter, ErrorLLMAdapter
-from llm_connect.models import RunConfig
+from llm_connect.models import LLMResponse, RunConfig
 from llm_connect.server import LLMServer
 
 
@@ -45,6 +53,35 @@ def _post(url: str, body: dict) -> tuple[int, dict]:
         return exc.code, json.loads(exc.read())
 
 
+class DiagnosticLLMAdapter(MockLLMAdapter):
+    def execute_prompt(self, prompt: str, config: RunConfig) -> LLMResponse:
+        record_provider_request(
+            url="https://provider.example/v1/chat",
+            payload={"prompt": prompt, "model": config.model_name},
+            headers={"Authorization": "Bearer secret-token"},
+        )
+        response = super().execute_prompt(prompt, config)
+        response.metadata["provider"] = "diagnostic"
+        response.metadata["response_id"] = "diag-response"
+        record_provider_response(status=200, body={"id": "diag-response", "content": response.content})
+        record_adapter_transformation(
+            "diagnostic_transform",
+            {"before": prompt},
+            {"after": response.content},
+        )
+        return response
+
+
+class BarrierLLMAdapter(MockLLMAdapter):
+    def __init__(self):
+        super().__init__(mock_response="parallel")
+        self._barrier = threading.Barrier(2)
+
+    def execute_prompt(self, prompt: str, config: RunConfig) -> LLMResponse:
+        self._barrier.wait(timeout=2.0)
+        return super().execute_prompt(prompt, config)
+
+
 class TestHealth:
     def test_health_returns_200(self, server):
         status, body = _get(f"http://127.0.0.1:{server.port}/health")
@@ -65,6 +102,7 @@ class TestExecute:
         assert status == 200
         assert body["content"] == "hello world"
         assert body["finish_reason"] == "stop"
+        assert "debug" not in body
 
     def test_response_includes_usage(self, server):
         status, body = _post(
@@ -150,3 +188,86 @@ class TestExecute:
         )
         assert status == 400
         assert "config" in body["error"]
+
+    def test_debug_query_returns_diagnostics(self):
+        s = LLMServer(adapter=DiagnosticLLMAdapter(mock_response="debug body"), port=0)
+        s.start()
+        try:
+            status, body = _post(
+                f"http://127.0.0.1:{s.port}/execute?debug=1",
+                {"prompt": "inspect", "config": {"model_name": "diagnostic-model"}},
+            )
+        finally:
+            s.stop()
+
+        assert status == 200
+        assert body["content"] == "debug body"
+        debug = body["debug"]
+        assert debug["provider_request"]["payload"] == {
+            "prompt": "inspect",
+            "model": "diagnostic-model",
+        }
+        assert debug["provider_request"]["headers_redacted"]["Authorization"] == "Bearer <redacted>"
+        assert debug["provider_response"]["status"] == 200
+        assert debug["adapter_transformations"][0]["step"] == "diagnostic_transform"
+
+    def test_debug_env_returns_diagnostics(self, monkeypatch):
+        monkeypatch.setenv("LLM_CONNECT_DEBUG", "1")
+        s = LLMServer(adapter=DiagnosticLLMAdapter(mock_response="debug body"), port=0)
+        s.start()
+        try:
+            status, body = _post(
+                f"http://127.0.0.1:{s.port}/execute",
+                {"prompt": "inspect"},
+            )
+        finally:
+            s.stop()
+
+        assert status == 200
+        assert "debug" in body
+
+    def test_audit_dir_records_replayable_call(self, monkeypatch, tmp_path):
+        monkeypatch.setenv("LLM_CONNECT_AUDIT_DIR", str(tmp_path))
+        s = LLMServer(adapter=DiagnosticLLMAdapter(mock_response="audit body"), port=0)
+        s.start()
+        try:
+            status, body = _post(
+                f"http://127.0.0.1:{s.port}/execute",
+                {"prompt": "audit me", "config": {"model_name": "audit-model"}},
+            )
+        finally:
+            s.stop()
+
+        assert status == 200
+        assert "debug" not in body
+        files = list(tmp_path.glob("*.json"))
+        assert len(files) == 1
+        record = json.loads(files[0].read_text(encoding="utf-8"))
+        assert record["prompt"] == "audit me"
+        assert record["config"]["model_name"] == "audit-model"
+        assert record["parsed_content"] == "audit body"
+        assert record["provider_request"]["headers_redacted"]["Authorization"] == "Bearer <redacted>"
+        assert record["provider_response"]["body"]["id"] == "diag-response"
+        assert record["latency_seconds"] >= 0
+
+    def test_execute_requests_run_concurrently(self):
+        s = LLMServer(adapter=BarrierLLMAdapter(), port=0)
+        s.start()
+        try:
+            start = time.monotonic()
+            with ThreadPoolExecutor(max_workers=2) as pool:
+                futures = [
+                    pool.submit(
+                        _post,
+                        f"http://127.0.0.1:{s.port}/execute",
+                        {"prompt": f"request {idx}"},
+                    )
+                    for idx in range(2)
+                ]
+                results = [future.result(timeout=3.0) for future in futures]
+            elapsed = time.monotonic() - start
+        finally:
+            s.stop()
+
+        assert [status for status, _body in results] == [200, 200]
+        assert elapsed < 1.5
diff --git a/tests/test_structured_output_smoke.py b/tests/test_structured_output_smoke.py
new file mode 100644
index 0000000..aeb0395
--- /dev/null
+++ b/tests/test_structured_output_smoke.py
@@ -0,0 +1,142 @@
+import json
+
+from llm_connect.gemini import GeminiAdapter
+from llm_connect.models import RunConfig
+from llm_connect.openai import OpenAIAdapter
+from llm_connect.openrouter import OpenRouterAdapter
+
+
+STRUCTURED_SCHEMA = {
+    "type": "object",
+    "properties": {
+        "summary": {"type": "string"},
+        "recommendations": {"type": "array", "items": {"type": "string"}},
+    },
+    "required": ["summary", "recommendations"],
+}
+
+
+SMOKE_CONFIG = RunConfig(
+    model_name="gpt-4",
+    temperature=0.1,
+    max_tokens=300,
+    model_params={
+        "reasoning_effort": "medium",
+        "max_depth": 3,
+        "json_schema": STRUCTURED_SCHEMA,
+    },
+)
+
+
+def test_openrouter_structured_output_payload_and_model_routing(monkeypatch):
+    captured: dict[str, object] = {}
+
+    def fake_post_json(url, payload, headers=None, timeout=300):  # noqa: ANN001
+        captured["url"] = url
+        captured["payload"] = payload
+        captured["headers"] = headers
+        captured["timeout"] = timeout
+        return {
+            "id": "or-response",
+            "model": payload["model"],
+            "choices": [
+                {
+                    "message": {
+                        "content": json.dumps(
+                            {"summary": "ok", "recommendations": ["keep payload clean"]}
+                        )
+                    },
+                    "finish_reason": "stop",
+                }
+            ],
+            "usage": {"prompt_tokens": 1, "completion_tokens": 2, "total_tokens": 3},
+        }
+
+    monkeypatch.setattr("llm_connect.openrouter.post_json", fake_post_json)
+    adapter = OpenRouterAdapter(
+        model="anthropic/claude-sonnet-4",
+        api_key="or-test",
+        api_base="https://openrouter.example/api/v1",
+    )
+
+    response = adapter.execute_prompt("Return JSON.", SMOKE_CONFIG)
+    payload = captured["payload"]
+
+    assert response.model == "anthropic/claude-sonnet-4"
+    assert payload["model"] == "anthropic/claude-sonnet-4"
+    assert payload["response_format"]["json_schema"]["schema"] == STRUCTURED_SCHEMA
+    assert payload["response_format"]["json_schema"]["strict"] is False
+    assert "reasoning_effort" not in payload
+    assert "max_depth" not in payload
+    assert "json_schema" not in payload
+
+
+def test_openai_structured_output_payload(monkeypatch):
+    captured: dict[str, object] = {}
+
+    def fake_post_json(url, payload, headers=None, timeout=300):  # noqa: ANN001
+        captured["payload"] = payload
+        return {
+            "id": "oa-response",
+            "model": payload["model"],
+            "choices": [
+                {
+                    "message": {
+                        "content": json.dumps({"summary": "ok", "recommendations": []})
+                    },
+                    "finish_reason": "stop",
+                }
+            ],
+            "usage": {"prompt_tokens": 1, "completion_tokens": 2, "total_tokens": 3},
+        }
+
+    monkeypatch.setattr("llm_connect.openai.post_json", fake_post_json)
+    adapter = OpenAIAdapter(model="gpt-4.1-mini", api_key="sk-test")
+
+    response = adapter.execute_prompt("Return JSON.", SMOKE_CONFIG)
+    payload = captured["payload"]
+
+    assert response.model == "gpt-4.1-mini"
+    assert payload["model"] == "gpt-4.1-mini"
+    assert payload["response_format"]["json_schema"]["schema"] == STRUCTURED_SCHEMA
+    assert "reasoning_effort" not in payload
+    assert "max_depth" not in payload
+    assert "json_schema" not in payload
+
+
+def test_gemini_structured_output_payload(monkeypatch):
+    captured: dict[str, object] = {}
+
+    def fake_post_json(url, payload, headers=None, timeout=300):  # noqa: ANN001
+        captured["url"] = url
+        captured["payload"] = payload
+        return {
+            "candidates": [
+                {
+                    "content": {
+                        "parts": [
+                            {"text": json.dumps({"summary": "ok", "recommendations": []})}
+                        ]
+                    },
+                    "finishReason": "STOP",
+                }
+            ],
+            "usageMetadata": {
+                "promptTokenCount": 1,
+                "candidatesTokenCount": 2,
+                "totalTokenCount": 3,
+            },
+        }
+
+    monkeypatch.setattr("llm_connect.gemini.post_json", fake_post_json)
+    adapter = GeminiAdapter(model="gemini-2.5-flash", api_key="gemini-test")
+
+    response = adapter.execute_prompt("Return JSON.", SMOKE_CONFIG)
+    payload = captured["payload"]
+
+    assert response.model == "gemini-2.5-flash"
+    assert payload["generationConfig"]["responseMimeType"] == "application/json"
+    assert payload["generationConfig"]["responseSchema"] == STRUCTURED_SCHEMA
+    assert "reasoning_effort" not in payload
+    assert "max_depth" not in payload
+    assert "json_schema" not in payload
diff --git a/workplans/ADHOC-2026-06-02.md b/workplans/ADHOC-2026-06-02.md
index 9304348..50b2918 100644
--- a/workplans/ADHOC-2026-06-02.md
+++ b/workplans/ADHOC-2026-06-02.md
@@ -4,11 +4,11 @@ type: workplan
 title: "Ad hoc — llm-connect lessons from CUST-WP-0045 canary"
 domain: custodian
 repo: llm-connect
-status: ready
+status: finished
 owner: custodian
 topic_slug: custodian
 created: "2026-06-02"
-updated: "2026-06-02"
+updated: "2026-06-03"
 state_hub_workstream_id: "1c936c91-79c7-427d-ab37-9052e8a61cda"
 ---
 
@@ -38,7 +38,7 @@ workplan.
 
 ```task
 id: ADHOC-2026-06-02-T01
-status: todo
+status: done
 priority: medium
 state_hub_task_id: "69626e9e-29f1-40f6-8cd2-d38a7e802293"
 ```
@@ -78,7 +78,7 @@ debug field is omitted in normal mode.
 
 ```task
 id: ADHOC-2026-06-02-T02
-status: todo
+status: done
 priority: low
 state_hub_task_id: "e2b1be30-71f7-4497-9b10-b0f24d37beba"
 ```
@@ -101,7 +101,7 @@ max of their individual latencies, not the sum.
 
 ```task
 id: ADHOC-2026-06-02-T03
-status: todo
+status: done
 priority: medium
 state_hub_task_id: "da4821f0-a876-44ce-9dc3-f3fc67732d0f"
 ```
@@ -127,7 +127,7 @@ ergonomics.
 
 ```task
 id: ADHOC-2026-06-02-T04
-status: todo
+status: done
 priority: medium
 state_hub_task_id: "f8a033e6-22ac-4700-b8d2-43a5d76a3751"
 ```
@@ -155,7 +155,7 @@ forbidden top-level fields, schema in the right wrapper).
 
 ```task
 id: ADHOC-2026-06-02-T05
-status: todo
+status: done
 priority: medium
 state_hub_task_id: "5d53dbb4-b374-45fe-b81c-ff0b222ca74f"
 ```
@@ -188,7 +188,7 @@ bug) before either was merged.
 
 ```task
 id: ADHOC-2026-06-02-T06
-status: todo
+status: done
 priority: low
 state_hub_task_id: "33fcb951-d7ab-4d3c-8d67-9eebd986c711"
 ```
@@ -210,3 +210,21 @@ would only send OpenAI-valid fields. Codify the contract in
 
 Done when a new adapter author can read the doc and know what their
 `_merge_model_params` implementation must support.
+
+## Implementation Notes
+
+Completed on 2026-06-03:
+
+- Added opt-in `/execute` debug envelopes via `LLM_CONNECT_DEBUG=1` or
+  `?debug=1`, with redacted provider request/response capture and adapter
+  transformation records.
+- Switched serve mode to `ThreadingHTTPServer` and added a concurrency
+  regression test.
+- Added `LLM_CONNECT_AUDIT_DIR` per-call audit records plus
+  `python -m llm_connect.replay` for parser/unwrapper replay.
+- Extracted shared OpenAI-compatible and Gemini payload translation helpers
+  and wired OpenRouter, OpenAI, and Gemini through them.
+- Added CI-safe structured-output smoke tests that mock provider HTTP calls
+  and assert model routing plus payload shape.
+- Documented the adapter `model_params` contract in
+  `docs/adapter-model-params.md`.