From 24f4c09d428e8be95ebb113b2d8a4ef7eb67bff6 Mon Sep 17 00:00:00 2001 From: tegwick Date: Wed, 3 Jun 2026 11:56:21 +0200 Subject: [PATCH] Implement llm-connect ADHOC diagnostics --- ARCHITECTURE-LAYERS.md | 3 + README.md | 38 +- docs/adapter-model-params.md | 102 +++++ llm_connect/_diagnostics.py | 153 +++++++ llm_connect/_http.py | 187 +++++---- llm_connect/_payload.py | 154 +++++++ llm_connect/claude_code.py | 566 +++++++++++++------------- llm_connect/gemini.py | 3 + llm_connect/openai.py | 3 + llm_connect/openrouter.py | 372 +++++++---------- llm_connect/replay.py | 121 ++++++ llm_connect/server.py | 85 +++- tests/test_payload.py | 81 ++++ tests/test_replay.py | 62 +++ tests/test_server.py | 123 +++++- tests/test_structured_output_smoke.py | 142 +++++++ workplans/ADHOC-2026-06-02.md | 34 +- 17 files changed, 1618 insertions(+), 611 deletions(-) create mode 100644 docs/adapter-model-params.md create mode 100644 llm_connect/_diagnostics.py create mode 100644 llm_connect/_payload.py create mode 100644 llm_connect/replay.py create mode 100644 tests/test_payload.py create mode 100644 tests/test_replay.py create mode 100644 tests/test_structured_output_smoke.py diff --git a/ARCHITECTURE-LAYERS.md b/ARCHITECTURE-LAYERS.md index 1c6a3ee..c954236 100644 --- a/ARCHITECTURE-LAYERS.md +++ b/ARCHITECTURE-LAYERS.md @@ -32,6 +32,9 @@ Maturity states: **Experimental → Beta → Stable → Deprecated** | `gemini.py` | `GeminiAdapter` — Google Generative Language API | Beta | | `openrouter.py` | `OpenRouterAdapter` — OpenAI-compatible multi-model routing | Beta | | `claude_code.py` | `ClaudeCodeAdapter` — `claude --print` subprocess | Beta | +| `_payload.py` | Shared adapter payload translation for `RunConfig.model_params` | Beta | +| `_diagnostics.py` | Opt-in per-call diagnostics capture for server debug and audit modes | Beta | +| `replay.py` | Audit replay parser CLI (`python -m llm_connect.replay`) | Beta | | `embedding_adapter.py` | `EmbeddingAdapter` ABC | Beta | | `embedding_openai.py` | `OpenAICompatibleEmbeddingAdapter` | Beta | | `embedding_cache.py` | `EmbeddingCache` — disk-backed embedding cache | Beta | diff --git a/README.md b/README.md index 1ac1514..0fd8690 100644 --- a/README.md +++ b/README.md @@ -73,15 +73,15 @@ config = RunConfig( ) ``` -| Field | Default | Description | -|---|---|---| -| `model_name` | `"gpt-4"` | Model identifier (adapter may override) | -| `temperature` | `0.7` | Sampling temperature | -| `max_tokens` | `2000` | Maximum output tokens | -| `model_params` | `{}` | Extra provider-specific parameters | -| `max_depth` | `3` | Max nesting depth for recursive calls | -| `skip_if_exists` | `True` | Skip if identical input hash already processed | -| `timeout_seconds` | `300` | Request timeout | +| Field | Default | Description | +|---|---|---| +| `model_name` | `"gpt-4"` | Model identifier (adapter may override) | +| `temperature` | `0.7` | Sampling temperature | +| `max_tokens` | `2000` | Maximum output tokens | +| `model_params` | `{}` | Portable extras translated by each adapter; see `docs/adapter-model-params.md` | +| `max_depth` | `3` | Max nesting depth for recursive calls | +| `skip_if_exists` | `True` | Skip if identical input hash already processed | +| `timeout_seconds` | `300` | Request timeout | ### `LLMResponse` @@ -92,8 +92,24 @@ response = adapter.execute_prompt(prompt, config) print(response.content) # generated text print(response.model) # model actually used print(response.usage) # {"prompt_tokens": …, "completion_tokens": …, "total_tokens": …} -print(response.finish_reason) # "stop", "length", etc. -``` +print(response.finish_reason) # "stop", "length", etc. +``` + +## Server diagnostics + +Serve mode can include a debug envelope without changing normal responses: + +```bash +LLM_CONNECT_DEBUG=1 python -m llm_connect.server --provider openrouter +curl 'http://127.0.0.1:8080/execute?debug=1' -d '{"prompt":"hi"}' +``` + +Set `LLM_CONNECT_AUDIT_DIR=/path/to/audit` to write per-call replay records, +then parse one without another provider call: + +```bash +python -m llm_connect.replay /path/to/audit/record.json --json +``` ## Writing your own adapter diff --git a/docs/adapter-model-params.md b/docs/adapter-model-params.md new file mode 100644 index 0000000..61e8ee3 --- /dev/null +++ b/docs/adapter-model-params.md @@ -0,0 +1,102 @@ +# Adapter `model_params` contract + +`RunConfig.model_params` is a portability layer, not a blind provider payload +escape hatch. Adapters must translate the shared keys they understand, pass +through only provider-valid keys, and drop provider-specific keys that would +make another provider reject the request. + +## Shared structured output + +Callers may request structured output with: + +```python +RunConfig( + model_params={ + "json_schema": { + "type": "object", + "properties": { + "summary": {"type": "string"}, + "recommendations": {"type": "array", "items": {"type": "string"}}, + }, + "required": ["summary", "recommendations"], + } + } +) +``` + +Adapters translate that key into the provider's native shape: + +| Adapter | Translation | +|---|---| +| OpenAI | `response_format = {"type": "json_schema", "json_schema": ...}` | +| OpenRouter | Same OpenAI-compatible `response_format` wrapper | +| Gemini | `generationConfig.responseMimeType = "application/json"` and `generationConfig.responseSchema = ...` | +| Claude Code CLI | `--json-schema ` plus `--output-format json`, then envelope unwrap | + +OpenAI-compatible adapters default `json_schema.strict` to `False`. Strict mode +requires schemas to meet provider-specific constraints such as +`additionalProperties: false` on object nodes and complete `required` lists. +Callers that need strict behavior can pass an explicit provider-native +`response_format` in `model_params`. + +## Pass-through keys + +OpenAI and OpenRouter pass through known Chat Completions fields: + +`top_p`, `n`, `stream`, `stop`, `presence_penalty`, `frequency_penalty`, +`logit_bias`, `user`, `seed`, `tools`, `tool_choice`, `response_format`, +`logprobs`, `top_logprobs`, and `parallel_tool_calls`. + +Gemini passes through valid `generateContent` top-level fields: + +`safetySettings`, `tools`, `toolConfig`, `systemInstruction`, and +`cachedContent`. + +Gemini also accepts generation config fields directly or via snake-case aliases: + +`candidateCount`, `candidate_count`, `stopSequences`, `stop_sequences`, +`maxOutputTokens`, `max_output_tokens`, `temperature`, `topP`, `top_p`, `topK`, +`top_k`, `responseMimeType`, `response_mime_type`, `responseSchema`, and +`response_schema`. + +## Dropped keys + +Adapters must drop keys that are meaningful to another adapter or to +llm-connect itself but invalid for the target provider. The current shared drop +set includes: + +`reasoning_effort`, `max_depth`, `claude_cli_path`, and raw `json_schema` after +translation. + +Unknown keys are ignored by default. This keeps activity-specific configs from +causing provider HTTP 400 errors when a caller switches providers. + +## Diagnostics and replay + +Server mode supports opt-in diagnostics for `/execute`: + +```bash +LLM_CONNECT_DEBUG=1 python -m llm_connect.server --provider openrouter +curl 'http://127.0.0.1:8080/execute?debug=1' -d '{"prompt":"hi"}' +``` + +Debug responses include a `debug` field with the redacted provider request, raw +provider response body, and adapter transformations such as `merge_model_params` +or `unwrap_cli_envelope`. Normal responses omit `debug`. + +Set `LLM_CONNECT_AUDIT_DIR=/path/to/audit` to write one JSON audit record per +`/execute` call. Audit records include the prompt, config, redacted provider +request, provider response, parsed content, and latency. Re-run parsing without +another provider call with: + +```bash +python -m llm_connect.replay /path/to/audit/record.json --json +``` + +## Server concurrency + +`llm_connect.server.LLMServer` uses `ThreadingHTTPServer`. Adapter instances +used in server mode must be safe to call concurrently. The bundled HTTP and +subprocess adapters keep per-call state local; custom adapters should avoid +mutating shared instance attributes during `execute_prompt` unless they use +their own locks. diff --git a/llm_connect/_diagnostics.py b/llm_connect/_diagnostics.py new file mode 100644 index 0000000..3215e87 --- /dev/null +++ b/llm_connect/_diagnostics.py @@ -0,0 +1,153 @@ +"""Per-call diagnostics capture for server debug and audit modes.""" + +from __future__ import annotations + +import copy +import json +from contextlib import contextmanager +from contextvars import ContextVar +from dataclasses import dataclass, field +from typing import Any, Iterator, Mapping +from urllib.parse import parse_qsl, urlencode, urlsplit, urlunsplit + + +_SECRET_QUERY_KEYS = {"key", "api_key", "apikey", "access_token", "token"} +_SECRET_HEADER_TOKENS = ("authorization", "api-key", "apikey", "token", "secret", "key") + + +@dataclass +class Diagnostics: + """Captured provider request/response details for one logical LLM call.""" + + provider_request: dict[str, Any] | None = None + provider_response: dict[str, Any] | None = None + adapter_transformations: list[dict[str, Any]] = field(default_factory=list) + + def to_dict(self) -> dict[str, Any]: + return { + "provider_request": self.provider_request, + "provider_response": self.provider_response, + "adapter_transformations": self.adapter_transformations, + } + + +_CURRENT: ContextVar[Diagnostics | None] = ContextVar( + "llm_connect_diagnostics", + default=None, +) + + +@contextmanager +def capture_diagnostics(enabled: bool = True) -> Iterator[Diagnostics | None]: + """Capture diagnostics within this context when *enabled* is true.""" + + if not enabled: + yield None + return + + diagnostics = Diagnostics() + token = _CURRENT.set(diagnostics) + try: + yield diagnostics + finally: + _CURRENT.reset(token) + + +def diagnostics_enabled() -> bool: + return _CURRENT.get() is not None + + +def current_diagnostics() -> Diagnostics | None: + return _CURRENT.get() + + +def record_provider_request( + *, + url: str | None = None, + payload: Any | None = None, + headers: Mapping[str, Any] | None = None, + command: list[str] | None = None, +) -> None: + diagnostics = _CURRENT.get() + if diagnostics is None: + return + + request: dict[str, Any] = {} + if url is not None: + request["url"] = redact_url(url) + if payload is not None: + request["payload"] = json_safe(payload) + if headers is not None: + request["headers_redacted"] = redact_headers(headers) + if command is not None: + request["command"] = list(command) + diagnostics.provider_request = request + + +def record_provider_response(*, status: int | None = None, body: Any | None = None) -> None: + diagnostics = _CURRENT.get() + if diagnostics is None: + return + + response: dict[str, Any] = {} + if status is not None: + response["status"] = status + if body is not None: + response["body"] = json_safe(body) + diagnostics.provider_response = response + + +def record_adapter_transformation(step: str, before: Any, after: Any) -> None: + diagnostics = _CURRENT.get() + if diagnostics is None: + return + + diagnostics.adapter_transformations.append( + { + "step": step, + "before": json_safe(before), + "after": json_safe(after), + } + ) + + +def json_safe(value: Any) -> Any: + """Return a JSON-serializable snapshot of *value* without mutating it.""" + + try: + return json.loads(json.dumps(value)) + except (TypeError, ValueError): + try: + return copy.deepcopy(value) + except Exception: + return repr(value) + + +def redact_headers(headers: Mapping[str, Any]) -> dict[str, Any]: + redacted: dict[str, Any] = {} + for key, value in headers.items(): + lowered = str(key).lower() + if any(token in lowered for token in _SECRET_HEADER_TOKENS): + redacted[str(key)] = _redact_header_value(value) + else: + redacted[str(key)] = json_safe(value) + return redacted + + +def redact_url(url: str) -> str: + parts = urlsplit(url) + query = [] + for key, value in parse_qsl(parts.query, keep_blank_values=True): + if key.lower() in _SECRET_QUERY_KEYS: + query.append((key, "")) + else: + query.append((key, value)) + return urlunsplit((parts.scheme, parts.netloc, parts.path, urlencode(query), parts.fragment)) + + +def _redact_header_value(value: Any) -> str: + text = str(value) + if " " in text: + scheme = text.split(" ", 1)[0] + return f"{scheme} " + return "" diff --git a/llm_connect/_http.py b/llm_connect/_http.py index d9429dc..4d3f66a 100644 --- a/llm_connect/_http.py +++ b/llm_connect/_http.py @@ -1,86 +1,101 @@ -""" -Thin synchronous HTTP helper built on :mod:`urllib.request`. - -Translates HTTP errors into typed :mod:`markitect.llm.exceptions`. -""" - -import json -import urllib.request -import urllib.error -from typing import Dict, Any, Optional - -from llm_connect.exceptions import ( - LLMAPIError, - LLMRateLimitError, - LLMTimeoutError, -) - - -def post_json( - url: str, - payload: Dict[str, Any], - headers: Optional[Dict[str, str]] = None, - timeout: int = 300, -) -> Dict[str, Any]: - """POST *payload* as JSON and return the parsed response body. - - Raises: - LLMRateLimitError: on HTTP 429 - LLMAPIError: on other non-2xx responses - LLMTimeoutError: on socket / read timeout - """ - data = json.dumps(payload).encode() - req = urllib.request.Request( - url, - data=data, - headers={"Content-Type": "application/json", **(headers or {})}, - method="POST", - ) - - try: - with urllib.request.urlopen(req, timeout=timeout) as resp: - body = resp.read().decode() - try: - return json.loads(body) - except json.JSONDecodeError as exc: - preview = body[:300].replace("\n", "\\n") - raise LLMAPIError( - f"Invalid JSON response from {url}: {exc} — body preview: {preview!r}", - cause=exc, - ) from exc - except urllib.error.HTTPError as exc: - body = "" - try: - body = exc.read().decode() - except Exception: - pass - - if exc.code == 429: - raise LLMRateLimitError( - f"Rate limited (429) from {url}", - status_code=429, - response_body=body, - cause=exc, - ) from exc - - raise LLMAPIError( - f"HTTP {exc.code} from {url}", - status_code=exc.code, - response_body=body, - cause=exc, - ) from exc - except urllib.error.URLError as exc: - if "timed out" in str(exc.reason): - raise LLMTimeoutError( - f"Request to {url} timed out after {timeout}s", - cause=exc, - ) from exc - raise LLMAPIError( - f"URL error for {url}: {exc.reason}", - cause=exc, - ) from exc - except TimeoutError as exc: - raise LLMTimeoutError( - f"Request to {url} timed out after {timeout}s", - cause=exc, - ) from exc +""" +Thin synchronous HTTP helper built on :mod:`urllib.request`. + +Translates HTTP errors into typed :mod:`markitect.llm.exceptions`. +""" + +import json +import urllib.error +import urllib.request +from typing import Any, Dict, Optional + +from llm_connect._diagnostics import record_provider_request, record_provider_response +from llm_connect.exceptions import ( + LLMAPIError, + LLMRateLimitError, + LLMTimeoutError, +) + + +def post_json( + url: str, + payload: Dict[str, Any], + headers: Optional[Dict[str, str]] = None, + timeout: int = 300, +) -> Dict[str, Any]: + """POST *payload* as JSON and return the parsed response body. + + Raises: + LLMRateLimitError: on HTTP 429 + LLMAPIError: on other non-2xx responses + LLMTimeoutError: on socket / read timeout + """ + record_provider_request(url=url, payload=payload, headers=headers or {}) + data = json.dumps(payload).encode() + req = urllib.request.Request( + url, + data=data, + headers={"Content-Type": "application/json", **(headers or {})}, + method="POST", + ) + + try: + with urllib.request.urlopen(req, timeout=timeout) as resp: + body = resp.read().decode() + try: + parsed = json.loads(body) + record_provider_response(status=resp.status, body=parsed) + return parsed + except json.JSONDecodeError as exc: + record_provider_response(status=resp.status, body=body) + preview = body[:300].replace("\n", "\\n") + raise LLMAPIError( + f"Invalid JSON response from {url}: {exc} - body preview: {preview!r}", + cause=exc, + ) from exc + except urllib.error.HTTPError as exc: + body = "" + try: + body = exc.read().decode() + except Exception: + pass + record_provider_response(status=exc.code, body=_json_or_text(body)) + + if exc.code == 429: + raise LLMRateLimitError( + f"Rate limited (429) from {url}", + status_code=429, + response_body=body, + cause=exc, + ) from exc + + raise LLMAPIError( + f"HTTP {exc.code} from {url}", + status_code=exc.code, + response_body=body, + cause=exc, + ) from exc + except urllib.error.URLError as exc: + record_provider_response(body={"error": str(exc.reason)}) + if "timed out" in str(exc.reason): + raise LLMTimeoutError( + f"Request to {url} timed out after {timeout}s", + cause=exc, + ) from exc + raise LLMAPIError( + f"URL error for {url}: {exc.reason}", + cause=exc, + ) from exc + except TimeoutError as exc: + record_provider_response(body={"error": "timeout"}) + raise LLMTimeoutError( + f"Request to {url} timed out after {timeout}s", + cause=exc, + ) from exc + + +def _json_or_text(body: str) -> Any: + try: + return json.loads(body) + except (TypeError, ValueError): + return body diff --git a/llm_connect/_payload.py b/llm_connect/_payload.py new file mode 100644 index 0000000..74d5c75 --- /dev/null +++ b/llm_connect/_payload.py @@ -0,0 +1,154 @@ +"""Provider payload helpers for translating ``RunConfig.model_params``.""" + +from __future__ import annotations + +import json +from typing import Any + +from llm_connect._diagnostics import ( + diagnostics_enabled, + json_safe, + record_adapter_transformation, +) + + +# OpenAI Chat Completions fields that map straight through from model_params. +# Anything not in this set is provider-specific and must be either translated +# or dropped. Blind merges are deliberately avoided because OpenAI-compatible +# providers commonly reject unknown top-level fields with HTTP 400. +OPENAI_CHAT_PASSTHROUGH_FIELDS = frozenset( + { + "top_p", + "n", + "stream", + "stop", + "presence_penalty", + "frequency_penalty", + "logit_bias", + "user", + "seed", + "tools", + "tool_choice", + "response_format", + "logprobs", + "top_logprobs", + "parallel_tool_calls", + } +) + + +DROPPED_NON_OPENAI_FIELDS = frozenset( + { + "reasoning_effort", + "max_depth", + "claude_cli_path", + "json_schema", + } +) + + +GEMINI_TOP_LEVEL_FIELDS = frozenset( + { + "safetySettings", + "tools", + "toolConfig", + "systemInstruction", + "cachedContent", + } +) + + +GEMINI_GENERATION_CONFIG_FIELDS = frozenset( + { + "candidateCount", + "stopSequences", + "maxOutputTokens", + "temperature", + "topP", + "topK", + "responseMimeType", + "responseSchema", + } +) + + +GEMINI_GENERATION_CONFIG_ALIASES = { + "candidate_count": "candidateCount", + "stop_sequences": "stopSequences", + "max_output_tokens": "maxOutputTokens", + "top_p": "topP", + "top_k": "topK", + "response_mime_type": "responseMimeType", + "response_schema": "responseSchema", +} + + +def merge_openai_chat_model_params(payload: dict[str, Any], model_params: dict[str, Any]) -> None: + """Merge model_params into an OpenAI Chat Completions-style payload. + + Translates ``json_schema`` to ``response_format``, passes known OpenAI + fields through, and drops Claude/llm-connect-only knobs. + """ + + before = json_safe(payload) if diagnostics_enabled() else None + + schema = _coerce_json_schema(model_params.get("json_schema")) + caller_response_format = model_params.get("response_format") + if schema is not None and caller_response_format is None and "response_format" not in payload: + payload["response_format"] = { + "type": "json_schema", + "json_schema": { + "name": "structured_output", + "schema": schema, + "strict": False, + }, + } + + for key, value in model_params.items(): + if key in DROPPED_NON_OPENAI_FIELDS: + continue + if key in OPENAI_CHAT_PASSTHROUGH_FIELDS: + payload[key] = value + + if before is not None: + record_adapter_transformation("merge_model_params.openai_chat", before, payload) + + +def merge_gemini_model_params(payload: dict[str, Any], model_params: dict[str, Any]) -> None: + """Merge model_params into a Gemini ``generateContent`` payload.""" + + before = json_safe(payload) if diagnostics_enabled() else None + generation_config = payload.setdefault("generationConfig", {}) + + schema = _coerce_json_schema(model_params.get("json_schema")) + if schema is not None and "responseSchema" not in generation_config: + generation_config["responseMimeType"] = "application/json" + generation_config["responseSchema"] = schema + + explicit_generation_config = model_params.get("generationConfig") + if isinstance(explicit_generation_config, dict): + generation_config.update(explicit_generation_config) + + for key, value in model_params.items(): + if key in {"json_schema", "generationConfig", "reasoning_effort", "max_depth"}: + continue + if key in GEMINI_TOP_LEVEL_FIELDS: + payload[key] = value + continue + gemini_key = GEMINI_GENERATION_CONFIG_ALIASES.get(key, key) + if gemini_key in GEMINI_GENERATION_CONFIG_FIELDS: + generation_config[gemini_key] = value + + if before is not None: + record_adapter_transformation("merge_model_params.gemini", before, payload) + + +def _coerce_json_schema(schema: Any) -> dict[str, Any] | None: + if isinstance(schema, str): + try: + schema = json.loads(schema) + except (TypeError, ValueError): + return None + if isinstance(schema, dict): + return schema + return None diff --git a/llm_connect/claude_code.py b/llm_connect/claude_code.py index 6a9c08f..95e383c 100644 --- a/llm_connect/claude_code.py +++ b/llm_connect/claude_code.py @@ -1,277 +1,289 @@ -""" -Claude Code CLI adapter — runs the ``claude`` CLI as a subprocess. -""" - -import asyncio -import json -import os -import subprocess -from pathlib import Path -from typing import Optional - -from llm_connect.adapter import LLMAdapter -from llm_connect.models import RunConfig, LLMResponse -from llm_connect.config import LLMConfig -from llm_connect._token_estimator import estimate_tokens -from llm_connect.exceptions import ( - LLMSubprocessError, - LLMTimeoutError, -) - - -class ClaudeCodeAdapter(LLMAdapter): - """LLM adapter that shells out to the ``claude`` CLI with ``--print``. - - The compiled prompt is piped via **stdin** to avoid shell argument - length limits (compiled prompts can exceed 30 KB). - """ - - def __init__( - self, - cli_path: Optional[str] = None, - model: Optional[str] = None, - config: Optional[LLMConfig] = None, - ): - self._config = config or LLMConfig(provider="claude-code") - self._cli_path = cli_path or self._resolve_cli_path() - self._model = model - - # ── LLMAdapter interface ──────────────────────────────────────── - - def execute_prompt(self, prompt: str, config: RunConfig) -> LLMResponse: - self._preflight_budget(config) - cmd = self._build_command(config) - - timeout = config.timeout_seconds or self._config.timeout_seconds - - try: - result = subprocess.run( - cmd, - input=prompt, - capture_output=True, - text=True, - timeout=timeout, - ) - except subprocess.TimeoutExpired as exc: - raise LLMTimeoutError( - f"claude CLI timed out after {timeout}s", - cause=exc, - ) from exc - - if result.returncode != 0: - raise LLMSubprocessError( - f"claude CLI exited with code {result.returncode}", - return_code=result.returncode, - stderr=result.stderr, - ) - - content = _unwrap_cli_json_envelope(result.stdout, config) - prompt_tokens = estimate_tokens(prompt) - completion_tokens = estimate_tokens(content) - - response = LLMResponse( - content=content, - model=self._model or "claude-code-cli", - usage={ - "prompt_tokens": prompt_tokens, - "completion_tokens": completion_tokens, - "total_tokens": prompt_tokens + completion_tokens, - }, - finish_reason="stop", - metadata={ - "provider": "claude-code", - "cli_path": self._cli_path, - }, - ) - self._consume_budget(config, response) - return response - - async def async_execute_prompt(self, prompt: str, config: RunConfig) -> LLMResponse: - """Native async implementation using asyncio.create_subprocess_exec.""" - self._preflight_budget(config) - cmd = self._build_command(config) - - timeout = config.timeout_seconds or self._config.timeout_seconds - - try: - proc = await asyncio.create_subprocess_exec( - *cmd, - stdin=asyncio.subprocess.PIPE, - stdout=asyncio.subprocess.PIPE, - stderr=asyncio.subprocess.PIPE, - ) - stdout_bytes, stderr_bytes = await asyncio.wait_for( - proc.communicate(input=prompt.encode()), - timeout=timeout, - ) - except asyncio.TimeoutError as exc: - raise LLMTimeoutError( - f"claude CLI timed out after {timeout}s", - cause=exc, - ) from exc - - if proc.returncode != 0: - raise LLMSubprocessError( - f"claude CLI exited with code {proc.returncode}", - return_code=proc.returncode, - stderr=stderr_bytes.decode(), - ) - - content = _unwrap_cli_json_envelope(stdout_bytes.decode(), config) - prompt_tokens = estimate_tokens(prompt) - completion_tokens = estimate_tokens(content) - - response = LLMResponse( - content=content, - model=self._model or "claude-code-cli", - usage={ - "prompt_tokens": prompt_tokens, - "completion_tokens": completion_tokens, - "total_tokens": prompt_tokens + completion_tokens, - }, - finish_reason="stop", - metadata={ - "provider": "claude-code", - "cli_path": self._cli_path, - "async": True, - }, - ) - self._consume_budget(config, response) - return response - - def validate_config(self, config: RunConfig) -> bool: - try: - result = subprocess.run( - [self._cli_path, "--version"], - capture_output=True, - text=True, - timeout=10, - ) - return result.returncode == 0 - except (subprocess.TimeoutExpired, FileNotFoundError, OSError): - return False - - def _build_command(self, config: RunConfig) -> list[str]: - cmd = [self._cli_path, "--print"] - if self._model: - cmd.extend(["--model", self._model]) - - json_schema = _json_schema_arg(config) - if json_schema: - cmd.extend(["--json-schema", json_schema]) - # With --json-schema alone the CLI prints conversational text on - # stdout while the structured payload ships on a sidecar channel - # callers cannot reach. --output-format json forces the structured - # response (wrapped in an envelope) onto stdout. - cmd.extend(["--output-format", "json"]) - return cmd - - def _resolve_cli_path(self) -> str: - configured = ( - os.environ.get("LLM_CONNECT_CLAUDE_CLI_PATH") - or os.environ.get("CLAUDE_CLI_PATH") - or self._config.claude_cli_path - ) - if configured and configured != "claude": - return configured - - local_cli = Path.home() / ".local" / "bin" / "claude" - if local_cli.exists(): - return str(local_cli) - return configured or "claude" - - -def _json_schema_arg(config: RunConfig) -> str | None: - schema = (config.model_params or {}).get("json_schema") - if not schema: - return None - if isinstance(schema, str): - return schema - if isinstance(schema, dict): - return json.dumps(schema, separators=(",", ":")) - return None - - -# Envelope field names Claude Code's `--output-format json` is known to use -# for the model's primary textual response. Used as a fall-back when no field -# carries a JSON-parseable payload (e.g. plain prose generation). -_ENVELOPE_TEXT_FIELDS = ("result", "result_text", "content", "text", "output") - - -def _unwrap_cli_json_envelope(stdout: str, config: RunConfig) -> str: - """Extract the model's payload from Claude CLI's --output-format json envelope. - - Only runs when --json-schema was set (the only code path that adds - --output-format json to the CLI invocation). Other callers keep the raw - stdout behavior unchanged. - - Strategy: when --json-schema is set the caller wants JSON back, so prefer - any envelope field whose value is itself valid JSON (dict, list, or a - string that parses as JSON). This handles two observed envelope shapes: - - 1. Short prompts where the model emits the structured payload directly - in the `result` field as a JSON-encoded string. - 2. Longer prompts where the model emits a conversational preamble in - `result` and the schema-enforced JSON in a separate field (the exact - field name varies across CLI versions). - - Fall back to the first text field only when no JSON-bearing field exists, - so non-schema callers via this code path still see the model's prose. - Surface the raw envelope as a last resort so the operator can see what - shape arrived and extend the strategy. - """ - if not _json_schema_arg(config): - return stdout - text = stdout.strip() - if not text: - return stdout - try: - envelope = json.loads(text) - except json.JSONDecodeError: - return stdout - if not isinstance(envelope, dict): - return stdout - - json_payload = _find_json_payload(envelope) - if json_payload is not None: - return json_payload - - for key in _ENVELOPE_TEXT_FIELDS: - value = envelope.get(key) - if isinstance(value, str): - return value - if isinstance(value, (dict, list)): - return json.dumps(value) - - return stdout - - -def _find_json_payload(envelope: dict) -> str | None: - """Return the first envelope value that represents valid JSON. - - Insertion order is preserved by Python dicts, so this prefers fields the - CLI lists earliest in its envelope. Skips obvious metadata keys (cost, - usage, timing) so we never accidentally pick a numeric or telemetry value. - """ - for key, value in envelope.items(): - if key in _ENVELOPE_METADATA_KEYS: - continue - if isinstance(value, (dict, list)): - return json.dumps(value) - if isinstance(value, str): - stripped = value.strip() - if stripped.startswith(("{", "[")): - try: - json.loads(stripped) - except json.JSONDecodeError: - continue - return stripped - return None - - -# Envelope keys that carry telemetry, never the model payload. -_ENVELOPE_METADATA_KEYS = frozenset({ - "type", "subtype", "model", "usage", "total_cost_usd", "cost_usd", - "duration_ms", "duration_api_ms", "num_turns", "session_id", - "is_error", "stop_reason", "permission_denials", "uuid", -}) +""" +Claude Code CLI adapter - runs the ``claude`` CLI as a subprocess. +""" + +import asyncio +import json +import os +import subprocess +from pathlib import Path +from typing import Optional + +from llm_connect._diagnostics import ( + record_adapter_transformation, + record_provider_request, + record_provider_response, +) +from llm_connect._token_estimator import estimate_tokens +from llm_connect.adapter import LLMAdapter +from llm_connect.config import LLMConfig +from llm_connect.exceptions import LLMSubprocessError, LLMTimeoutError +from llm_connect.models import LLMResponse, RunConfig + + +class ClaudeCodeAdapter(LLMAdapter): + """LLM adapter that shells out to the ``claude`` CLI with ``--print``. + + The compiled prompt is piped via stdin to avoid shell argument length + limits. Compiled prompts can exceed 30 KB. + """ + + def __init__( + self, + cli_path: Optional[str] = None, + model: Optional[str] = None, + config: Optional[LLMConfig] = None, + ): + self._config = config or LLMConfig(provider="claude-code") + self._cli_path = cli_path or self._resolve_cli_path() + self._model = model + + # LLMAdapter interface + + def execute_prompt(self, prompt: str, config: RunConfig) -> LLMResponse: + self._preflight_budget(config) + cmd = self._build_command(config) + + timeout = config.timeout_seconds or self._config.timeout_seconds + record_provider_request(command=cmd, payload={"stdin": prompt}) + + try: + result = subprocess.run( + cmd, + input=prompt, + capture_output=True, + text=True, + timeout=timeout, + ) + except subprocess.TimeoutExpired as exc: + raise LLMTimeoutError( + f"claude CLI timed out after {timeout}s", + cause=exc, + ) from exc + + record_provider_response( + status=result.returncode, + body={"stdout": result.stdout, "stderr": result.stderr}, + ) + if result.returncode != 0: + raise LLMSubprocessError( + f"claude CLI exited with code {result.returncode}", + return_code=result.returncode, + stderr=result.stderr, + ) + + content = _unwrap_cli_json_envelope(result.stdout, config) + prompt_tokens = estimate_tokens(prompt) + completion_tokens = estimate_tokens(content) + + response = LLMResponse( + content=content, + model=self._model or "claude-code-cli", + usage={ + "prompt_tokens": prompt_tokens, + "completion_tokens": completion_tokens, + "total_tokens": prompt_tokens + completion_tokens, + }, + finish_reason="stop", + metadata={ + "provider": "claude-code", + "cli_path": self._cli_path, + }, + ) + self._consume_budget(config, response) + return response + + async def async_execute_prompt(self, prompt: str, config: RunConfig) -> LLMResponse: + """Native async implementation using asyncio.create_subprocess_exec.""" + self._preflight_budget(config) + cmd = self._build_command(config) + + timeout = config.timeout_seconds or self._config.timeout_seconds + record_provider_request(command=cmd, payload={"stdin": prompt}) + + try: + proc = await asyncio.create_subprocess_exec( + *cmd, + stdin=asyncio.subprocess.PIPE, + stdout=asyncio.subprocess.PIPE, + stderr=asyncio.subprocess.PIPE, + ) + stdout_bytes, stderr_bytes = await asyncio.wait_for( + proc.communicate(input=prompt.encode()), + timeout=timeout, + ) + except asyncio.TimeoutError as exc: + raise LLMTimeoutError( + f"claude CLI timed out after {timeout}s", + cause=exc, + ) from exc + + stdout = stdout_bytes.decode() + stderr = stderr_bytes.decode() + record_provider_response( + status=proc.returncode, + body={"stdout": stdout, "stderr": stderr}, + ) + if proc.returncode != 0: + raise LLMSubprocessError( + f"claude CLI exited with code {proc.returncode}", + return_code=proc.returncode, + stderr=stderr, + ) + + content = _unwrap_cli_json_envelope(stdout, config) + prompt_tokens = estimate_tokens(prompt) + completion_tokens = estimate_tokens(content) + + response = LLMResponse( + content=content, + model=self._model or "claude-code-cli", + usage={ + "prompt_tokens": prompt_tokens, + "completion_tokens": completion_tokens, + "total_tokens": prompt_tokens + completion_tokens, + }, + finish_reason="stop", + metadata={ + "provider": "claude-code", + "cli_path": self._cli_path, + "async": True, + }, + ) + self._consume_budget(config, response) + return response + + def validate_config(self, config: RunConfig) -> bool: + try: + result = subprocess.run( + [self._cli_path, "--version"], + capture_output=True, + text=True, + timeout=10, + ) + return result.returncode == 0 + except (subprocess.TimeoutExpired, FileNotFoundError, OSError): + return False + + def _build_command(self, config: RunConfig) -> list[str]: + cmd = [self._cli_path, "--print"] + if self._model: + cmd.extend(["--model", self._model]) + + json_schema = _json_schema_arg(config) + if json_schema: + cmd.extend(["--json-schema", json_schema]) + # With --json-schema alone the CLI prints conversational text on + # stdout while the structured payload ships on a sidecar channel + # callers cannot reach. --output-format json forces the structured + # response (wrapped in an envelope) onto stdout. + cmd.extend(["--output-format", "json"]) + return cmd + + def _resolve_cli_path(self) -> str: + configured = ( + os.environ.get("LLM_CONNECT_CLAUDE_CLI_PATH") + or os.environ.get("CLAUDE_CLI_PATH") + or self._config.claude_cli_path + ) + if configured and configured != "claude": + return configured + + local_cli = Path.home() / ".local" / "bin" / "claude" + if local_cli.exists(): + return str(local_cli) + return configured or "claude" + + +def _json_schema_arg(config: RunConfig) -> str | None: + schema = (config.model_params or {}).get("json_schema") + if not schema: + return None + if isinstance(schema, str): + return schema + if isinstance(schema, dict): + return json.dumps(schema, separators=(",", ":")) + return None + + +# Envelope field names Claude Code's --output-format json is known to use for +# the model's primary textual response. Used as a fallback when no field carries +# a JSON-parseable payload, such as plain prose generation. +_ENVELOPE_TEXT_FIELDS = ("result", "result_text", "content", "text", "output") + + +def _unwrap_cli_json_envelope(stdout: str, config: RunConfig) -> str: + """Extract the model's payload from Claude CLI's --output-format json envelope. + + Only runs when --json-schema was set. Other callers keep the raw stdout + behavior unchanged. + """ + if not _json_schema_arg(config): + return stdout + text = stdout.strip() + if not text: + return stdout + try: + envelope = json.loads(text) + except json.JSONDecodeError: + return stdout + if not isinstance(envelope, dict): + return stdout + + json_payload = _find_json_payload(envelope) + if json_payload is not None: + return _record_unwrap(stdout, json_payload) + + for key in _ENVELOPE_TEXT_FIELDS: + value = envelope.get(key) + if isinstance(value, str): + return _record_unwrap(stdout, value) + if isinstance(value, (dict, list)): + return _record_unwrap(stdout, json.dumps(value)) + + return stdout + + +def _find_json_payload(envelope: dict) -> str | None: + """Return the first envelope value that represents valid JSON.""" + for key, value in envelope.items(): + if key in _ENVELOPE_METADATA_KEYS: + continue + if isinstance(value, (dict, list)): + return json.dumps(value) + if isinstance(value, str): + stripped = value.strip() + if stripped.startswith(("{", "[")): + try: + json.loads(stripped) + except json.JSONDecodeError: + continue + return stripped + return None + + +# Envelope keys that carry telemetry, never the model payload. +_ENVELOPE_METADATA_KEYS = frozenset( + { + "type", + "subtype", + "model", + "usage", + "total_cost_usd", + "cost_usd", + "duration_ms", + "duration_api_ms", + "num_turns", + "session_id", + "is_error", + "stop_reason", + "permission_denials", + "uuid", + } +) + + +def _record_unwrap(stdout: str, content: str) -> str: + if content != stdout: + record_adapter_transformation("unwrap_cli_envelope", stdout, content) + return content diff --git a/llm_connect/gemini.py b/llm_connect/gemini.py index 7d5db8d..c6e674c 100644 --- a/llm_connect/gemini.py +++ b/llm_connect/gemini.py @@ -9,6 +9,7 @@ from llm_connect.adapter import LLMAdapter from llm_connect.models import RunConfig, LLMResponse from llm_connect.config import resolve_api_key, find_project_root from llm_connect._http import post_json +from llm_connect._payload import merge_gemini_model_params from llm_connect.exceptions import LLMConfigurationError _DEFAULT_MODEL = "gemini-2.5-flash" @@ -74,6 +75,8 @@ class GeminiAdapter(LLMAdapter): "maxOutputTokens": config.max_tokens, }, } + if config.model_params: + merge_gemini_model_params(payload, config.model_params) url = f"{_API_BASE}/models/{model}:generateContent?key={self._api_key}" diff --git a/llm_connect/openai.py b/llm_connect/openai.py index c0c76d2..14e8e6f 100644 --- a/llm_connect/openai.py +++ b/llm_connect/openai.py @@ -9,6 +9,7 @@ from llm_connect.adapter import LLMAdapter from llm_connect.models import RunConfig, LLMResponse from llm_connect.config import resolve_api_key, find_project_root from llm_connect._http import post_json +from llm_connect._payload import merge_openai_chat_model_params from llm_connect.exceptions import ( LLMConfigurationError, LLMAPIError, @@ -65,6 +66,8 @@ class OpenAIAdapter(LLMAdapter): "temperature": config.temperature, "max_tokens": config.max_tokens, } + if config.model_params: + merge_openai_chat_model_params(payload, config.model_params) headers = { "Authorization": f"Bearer {self._api_key}", diff --git a/llm_connect/openrouter.py b/llm_connect/openrouter.py index 623c5a8..c4027da 100644 --- a/llm_connect/openrouter.py +++ b/llm_connect/openrouter.py @@ -1,221 +1,151 @@ -""" -OpenRouter adapter — calls the OpenAI-compatible chat completions API. -""" - -import time -from typing import Optional, Dict, Any - -from llm_connect.adapter import LLMAdapter -from llm_connect.models import RunConfig, LLMResponse -from llm_connect.config import LLMConfig, resolve_api_key, find_project_root -from llm_connect._http import post_json -from llm_connect.exceptions import ( - LLMConfigurationError, - LLMAPIError, - LLMRateLimitError, -) - -_DEFAULT_MODEL = "anthropic/claude-sonnet-4" - - -class OpenRouterAdapter(LLMAdapter): - """LLM adapter that calls the OpenRouter chat completions endpoint. - - Constructor args override values from *config*; *config* overrides - global defaults. The model used for a given call is resolved as: - ``constructor model > RunConfig.model_name > default``. - """ - - def __init__( - self, - model: Optional[str] = None, - api_key: Optional[str] = None, - api_base: Optional[str] = None, - config: Optional[LLMConfig] = None, - system_prompt: Optional[str] = None, - extra_headers: Optional[Dict[str, str]] = None, - max_retries: Optional[int] = None, - ): - self._config = config or LLMConfig() - # Track whether the model was explicitly supplied (constructor or - # LLMConfig). Comparing self._model to _DEFAULT_MODEL is not enough — - # callers who pass --model anthropic/claude-sonnet-4 happen to match - # the default and would otherwise be misrouted to RunConfig.model_name - # (which defaults to "gpt-4" — quietly sending every call to OpenAI's - # gpt-4 model, which is what broke the activity-core CUST-WP-0045 - # canary on 2026-06-02). - self._explicit_model = model is not None or self._config.model is not None - self._model = model or self._config.model or _DEFAULT_MODEL - self._api_base = (api_base or self._config.api_base).rstrip("/") - self._system_prompt = system_prompt - self._extra_headers = extra_headers or {} - self._max_retries = max_retries if max_retries is not None else self._config.max_retries - - # Resolve API key - root = find_project_root() - key_file_paths = [root / "apikey-openrouter.txt"] if root else [] - self._api_key = resolve_api_key( - explicit=api_key or self._config.api_key, - env_var="OPENROUTER_API_KEY", - key_file_paths=key_file_paths, - ) - - # ── LLMAdapter interface ──────────────────────────────────────── - - def execute_prompt(self, prompt: str, config: RunConfig) -> LLMResponse: - self._preflight_budget(config) - # Explicit constructor/LLMConfig model wins; only fall back to the - # per-call RunConfig.model_name when the adapter wasn't told what to - # use. RunConfig.model_name defaults to "gpt-4", so falling back - # unconditionally would silently misroute callers. - if self._explicit_model: - model = self._model - else: - model = config.model_name or self._model - - messages: list[Dict[str, str]] = [] - if self._system_prompt: - messages.append({"role": "system", "content": self._system_prompt}) - messages.append({"role": "user", "content": prompt}) - - payload: Dict[str, Any] = { - "model": model, - "messages": messages, - "temperature": config.temperature, - "max_tokens": config.max_tokens, - } - if config.model_params: - _merge_model_params(payload, config.model_params) - - headers = { - "Authorization": f"Bearer {self._api_key}", - **self._extra_headers, - } - url = f"{self._api_base}/chat/completions" - - start = time.time() - data = self._post_with_retries(url, payload, headers, config.timeout_seconds) - latency = time.time() - start - - # Parse response - choice = data.get("choices", [{}])[0] - content = choice.get("message", {}).get("content", "") - finish_reason = choice.get("finish_reason", "stop") - usage = data.get("usage", {}) - - response = LLMResponse( - content=content, - model=data.get("model", model), - usage={ - "prompt_tokens": usage.get("prompt_tokens", 0), - "completion_tokens": usage.get("completion_tokens", 0), - "total_tokens": usage.get("total_tokens", 0), - }, - finish_reason=finish_reason, - metadata={ - "provider": "openrouter", - "latency_seconds": round(latency, 3), - "response_id": data.get("id", ""), - }, - ) - self._consume_budget(config, response) - return response - - def validate_config(self, config: RunConfig) -> bool: - if not self._api_key: - return False - if not (self._model or config.model_name): - return False - if not (0.0 <= config.temperature <= 2.0): - return False - return True - - # ── Internals ─────────────────────────────────────────────────── - - def _post_with_retries( - self, - url: str, - payload: Dict[str, Any], - headers: Dict[str, str], - timeout: int, - ) -> Dict[str, Any]: - last_exc: Optional[Exception] = None - for attempt in range(self._max_retries + 1): - try: - return post_json(url, payload, headers, timeout=timeout) - except LLMRateLimitError as exc: - last_exc = exc - if attempt < self._max_retries: - time.sleep(2 ** attempt) - except LLMAPIError as exc: - if exc.status_code >= 500 and attempt < self._max_retries: - last_exc = exc - time.sleep(2 ** attempt) - else: - raise - raise last_exc # type: ignore[misc] - - -# OpenAI Chat Completions fields that map straight through from model_params. -# Anything not in this set is provider-specific and must be either translated -# or dropped — we never blind-merge into the payload, because OpenRouter -# rejects unknown top-level fields with HTTP 400. -_OPENAI_PASSTHROUGH_FIELDS = frozenset({ - "top_p", "n", "stream", "stop", "presence_penalty", - "frequency_penalty", "logit_bias", "user", "seed", - "tools", "tool_choice", "response_format", - "logprobs", "top_logprobs", "parallel_tool_calls", -}) - -# Provider-specific model_params keys that have no OpenAI Chat Completions -# equivalent and must be silently dropped to keep payloads valid. -_DROPPED_NON_OPENAI_FIELDS = frozenset({ - "reasoning_effort", # Claude CLI / Anthropic-specific - "max_depth", # llm-connect's own depth knob - "claude_cli_path", # adapter wiring leak - "json_schema", # translated below into response_format -}) - - -def _merge_model_params(payload: Dict[str, Any], model_params: Dict[str, Any]) -> None: - """Merge RunConfig.model_params into an OpenAI Chat Completions payload. - - Pass-through whitelisted OpenAI keys, translate json_schema into the - proper response_format wrapper, drop known provider-specific fields, - and ignore anything else rather than letting it through and triggering - a 400 from OpenRouter (the failure mode that hit CUST-WP-0045 on - 2026-06-02 — reasoning_effort and a top-level json_schema were merged - into the body and the API rejected both). - """ - schema = model_params.get("json_schema") - if schema is not None and "response_format" not in payload: - if isinstance(schema, str): - try: - import json as _json - schema = _json.loads(schema) - except (ValueError, TypeError): - schema = None - if isinstance(schema, dict): - # strict=False: OpenAI's strict mode requires additionalProperties - # to be false on every object and every property in the required - # list. Most application-supplied schemas are not written that - # way (the activity-core daily-triage schema, for example, has - # neither). With strict=False, OpenRouter still honours the - # schema as a soft constraint and the model's output remains - # structured. Callers can opt back into strict by including - # `strict: true` themselves in a custom `response_format`. - payload["response_format"] = { - "type": "json_schema", - "json_schema": { - "name": "structured_output", - "schema": schema, - "strict": False, - }, - } - - for key, value in model_params.items(): - if key in _DROPPED_NON_OPENAI_FIELDS: - continue - if key in _OPENAI_PASSTHROUGH_FIELDS: - payload[key] = value - # else: silently drop unknown keys rather than risk a 400. +""" +OpenRouter adapter - calls the OpenAI-compatible chat completions API. +""" + +import time +from typing import Any, Dict, Optional + +from llm_connect._http import post_json +from llm_connect._payload import merge_openai_chat_model_params +from llm_connect.adapter import LLMAdapter +from llm_connect.config import LLMConfig, find_project_root, resolve_api_key +from llm_connect.exceptions import LLMAPIError, LLMRateLimitError +from llm_connect.models import LLMResponse, RunConfig + +_DEFAULT_MODEL = "anthropic/claude-sonnet-4" + + +class OpenRouterAdapter(LLMAdapter): + """LLM adapter that calls the OpenRouter chat completions endpoint. + + Constructor args override values from *config*; *config* overrides + global defaults. The model used for a given call is resolved as: + ``constructor model > RunConfig.model_name > default``. + """ + + def __init__( + self, + model: Optional[str] = None, + api_key: Optional[str] = None, + api_base: Optional[str] = None, + config: Optional[LLMConfig] = None, + system_prompt: Optional[str] = None, + extra_headers: Optional[Dict[str, str]] = None, + max_retries: Optional[int] = None, + ): + self._config = config or LLMConfig() + # Track whether the model was explicitly supplied (constructor or + # LLMConfig). Comparing self._model to _DEFAULT_MODEL is not enough: + # callers who pass --model anthropic/claude-sonnet-4 happen to match + # the default and would otherwise be misrouted to RunConfig.model_name + # (which defaults to "gpt-4", quietly sending every call to OpenAI's + # gpt-4 model, which is what broke the activity-core CUST-WP-0045 + # canary on 2026-06-02). + self._explicit_model = model is not None or self._config.model is not None + self._model = model or self._config.model or _DEFAULT_MODEL + self._api_base = (api_base or self._config.api_base).rstrip("/") + self._system_prompt = system_prompt + self._extra_headers = extra_headers or {} + self._max_retries = max_retries if max_retries is not None else self._config.max_retries + + root = find_project_root() + key_file_paths = [root / "apikey-openrouter.txt"] if root else [] + self._api_key = resolve_api_key( + explicit=api_key or self._config.api_key, + env_var="OPENROUTER_API_KEY", + key_file_paths=key_file_paths, + ) + + # LLMAdapter interface + + def execute_prompt(self, prompt: str, config: RunConfig) -> LLMResponse: + self._preflight_budget(config) + # Explicit constructor/LLMConfig model wins; only fall back to the + # per-call RunConfig.model_name when the adapter was not told what to + # use. RunConfig.model_name defaults to "gpt-4", so falling back + # unconditionally would silently misroute callers. + if self._explicit_model: + model = self._model + else: + model = config.model_name or self._model + + messages: list[Dict[str, str]] = [] + if self._system_prompt: + messages.append({"role": "system", "content": self._system_prompt}) + messages.append({"role": "user", "content": prompt}) + + payload: Dict[str, Any] = { + "model": model, + "messages": messages, + "temperature": config.temperature, + "max_tokens": config.max_tokens, + } + if config.model_params: + merge_openai_chat_model_params(payload, config.model_params) + + headers = { + "Authorization": f"Bearer {self._api_key}", + **self._extra_headers, + } + url = f"{self._api_base}/chat/completions" + + start = time.time() + data = self._post_with_retries(url, payload, headers, config.timeout_seconds) + latency = time.time() - start + + choice = data.get("choices", [{}])[0] + content = choice.get("message", {}).get("content", "") + finish_reason = choice.get("finish_reason", "stop") + usage = data.get("usage", {}) + + response = LLMResponse( + content=content, + model=data.get("model", model), + usage={ + "prompt_tokens": usage.get("prompt_tokens", 0), + "completion_tokens": usage.get("completion_tokens", 0), + "total_tokens": usage.get("total_tokens", 0), + }, + finish_reason=finish_reason, + metadata={ + "provider": "openrouter", + "latency_seconds": round(latency, 3), + "response_id": data.get("id", ""), + }, + ) + self._consume_budget(config, response) + return response + + def validate_config(self, config: RunConfig) -> bool: + if not self._api_key: + return False + if not (self._model or config.model_name): + return False + if not (0.0 <= config.temperature <= 2.0): + return False + return True + + # Internals + + def _post_with_retries( + self, + url: str, + payload: Dict[str, Any], + headers: Dict[str, str], + timeout: int, + ) -> Dict[str, Any]: + last_exc: Optional[Exception] = None + for attempt in range(self._max_retries + 1): + try: + return post_json(url, payload, headers, timeout=timeout) + except LLMRateLimitError as exc: + last_exc = exc + if attempt < self._max_retries: + time.sleep(2 ** attempt) + except LLMAPIError as exc: + if exc.status_code >= 500 and attempt < self._max_retries: + last_exc = exc + time.sleep(2 ** attempt) + else: + raise + raise last_exc # type: ignore[misc] diff --git a/llm_connect/replay.py b/llm_connect/replay.py new file mode 100644 index 0000000..082c122 --- /dev/null +++ b/llm_connect/replay.py @@ -0,0 +1,121 @@ +"""Replay llm-connect audit records without making provider calls.""" + +from __future__ import annotations + +import argparse +import json +from pathlib import Path +from typing import Any + +from llm_connect.claude_code import _unwrap_cli_json_envelope +from llm_connect.models import RunConfig + + +def parse_audit_record(record: dict[str, Any]) -> dict[str, Any]: + """Parse the recorded provider response and compare it to saved content.""" + + config = RunConfig.from_dict(record.get("config", {})) + provider = record.get("provider") or _infer_provider(record) + provider_response = record.get("provider_response") or {} + body = provider_response.get("body") + parsed_content = _parse_provider_response(provider, body, config) + recorded_content = record.get("parsed_content") + schema_check = _check_structured_output(parsed_content, config.model_params.get("json_schema")) + + return { + "provider": provider, + "parsed_content": parsed_content, + "matches_recorded_content": parsed_content == recorded_content, + "structured_output": schema_check, + } + + +def main(argv: list[str] | None = None) -> None: + parser = argparse.ArgumentParser( + prog="python -m llm_connect.replay", + description="Replay parsing for a llm-connect audit JSON file.", + ) + parser.add_argument("audit_file", help="Path to an audit JSON file") + parser.add_argument("--json", action="store_true", help="Print the full replay report") + args = parser.parse_args(argv) + + record = json.loads(Path(args.audit_file).read_text(encoding="utf-8")) + report = parse_audit_record(record) + if args.json: + print(json.dumps(report, indent=2, sort_keys=True)) + else: + print(report["parsed_content"]) + + +def _parse_provider_response(provider: str | None, body: Any, config: RunConfig) -> str: + if provider in {"openai", "openrouter"}: + if isinstance(body, dict): + choice = (body.get("choices") or [{}])[0] + return choice.get("message", {}).get("content", "") + return "" + + if provider == "gemini": + if isinstance(body, dict): + candidates = body.get("candidates") or [] + if not candidates: + return "" + parts = candidates[0].get("content", {}).get("parts", []) + return "".join(part.get("text", "") for part in parts) + return "" + + if provider == "claude-code": + if isinstance(body, dict): + return _unwrap_cli_json_envelope(body.get("stdout", ""), config) + return "" + + if isinstance(body, str): + return body + if body is None: + return "" + return json.dumps(body) + + +def _infer_provider(record: dict[str, Any]) -> str | None: + request = record.get("provider_request") or {} + url = request.get("url", "") + if "openrouter.ai" in url: + return "openrouter" + if "api.openai.com" in url: + return "openai" + if "generativelanguage.googleapis.com" in url: + return "gemini" + if request.get("command"): + return "claude-code" + return None + + +def _check_structured_output(content: str, schema: Any) -> dict[str, Any]: + if not schema: + return {"checked": False} + if isinstance(schema, str): + try: + schema = json.loads(schema) + except ValueError as exc: + return {"checked": True, "valid": False, "error": f"invalid schema JSON: {exc}"} + if not isinstance(schema, dict): + return {"checked": True, "valid": False, "error": "schema must be an object"} + + try: + parsed = json.loads(content) + except ValueError as exc: + return {"checked": True, "valid": False, "error": f"invalid output JSON: {exc}"} + + missing = [] + if schema.get("type") == "object": + if not isinstance(parsed, dict): + return {"checked": True, "valid": False, "error": "output is not an object"} + for key in schema.get("required", []): + if key not in parsed: + missing.append(key) + if missing: + return {"checked": True, "valid": False, "missing_required": missing} + return {"checked": True, "valid": True} + + +if __name__ == "__main__": + main() diff --git a/llm_connect/server.py b/llm_connect/server.py index d7f1093..93f52a4 100644 --- a/llm_connect/server.py +++ b/llm_connect/server.py @@ -21,13 +21,21 @@ Usage (CLI):: """ import argparse +import datetime as _dt import json +import os +import re import threading -from http.server import BaseHTTPRequestHandler, HTTPServer +import time +import uuid +from http.server import BaseHTTPRequestHandler, ThreadingHTTPServer +from pathlib import Path from typing import Optional +from urllib.parse import parse_qs, urlsplit +from llm_connect._diagnostics import capture_diagnostics from llm_connect.adapter import LLMAdapter -from llm_connect.models import RunConfig +from llm_connect.models import LLMResponse, RunConfig class _Handler(BaseHTTPRequestHandler): @@ -39,7 +47,8 @@ class _Handler(BaseHTTPRequestHandler): # ── GET ──────────────────────────────────────────────────────── def do_GET(self): - if self.path == "/health": + parsed = urlsplit(self.path) + if parsed.path == "/health": self._respond(200, {"status": "ok"}) else: self._respond(404, {"error": "not found"}) @@ -47,10 +56,13 @@ class _Handler(BaseHTTPRequestHandler): # ── POST ─────────────────────────────────────────────────────── def do_POST(self): - if self.path != "/execute": + parsed = urlsplit(self.path) + if parsed.path != "/execute": self._respond(404, {"error": "not found"}) return + debug_enabled = _debug_requested(parsed.query) + audit_dir = os.environ.get("LLM_CONNECT_AUDIT_DIR") length = int(self.headers.get("Content-Length", 0)) raw = self.rfile.read(length) try: @@ -70,9 +82,19 @@ class _Handler(BaseHTTPRequestHandler): return config = RunConfig.from_dict(cfg) + start = time.time() + diagnostics_enabled = debug_enabled or bool(audit_dir) try: - response = self.server.adapter.execute_prompt(prompt, config) # type: ignore[attr-defined] - self._respond(200, response.to_dict()) + with capture_diagnostics(diagnostics_enabled) as diagnostics: + response = self.server.adapter.execute_prompt(prompt, config) # type: ignore[attr-defined] + latency = time.time() - start + body = response.to_dict() + debug = diagnostics.to_dict() if diagnostics is not None else None + if debug_enabled and debug is not None: + body["debug"] = debug + if audit_dir: + _write_audit_record(audit_dir, prompt, config, response, debug, latency) + self._respond(200, body) except Exception as exc: self._respond(500, {"error": str(exc)}) @@ -102,7 +124,7 @@ class LLMServer: host: str = "127.0.0.1", port: int = 8080, ) -> None: - self._httpd = HTTPServer((host, port), _Handler) + self._httpd = ThreadingHTTPServer((host, port), _Handler) self._httpd.adapter = adapter # type: ignore[attr-defined] self._thread: Optional[threading.Thread] = None @@ -138,6 +160,55 @@ def _build_adapter(provider: str, model: Optional[str]) -> LLMAdapter: return create_adapter(provider, model=model) +def _debug_requested(query: str) -> bool: + env = os.environ.get("LLM_CONNECT_DEBUG", "") + if _truthy(env): + return True + values = parse_qs(query).get("debug", []) + return any(_truthy(value) for value in values) + + +def _truthy(value: str) -> bool: + return value.strip().lower() in {"1", "true", "yes", "on"} + + +def _write_audit_record( + audit_dir: str, + prompt: str, + config: RunConfig, + response: LLMResponse, + debug: dict | None, + latency_seconds: float, +) -> None: + target_dir = Path(audit_dir) + target_dir.mkdir(parents=True, exist_ok=True) + + now = _dt.datetime.now(_dt.timezone.utc) + response_id = str(response.metadata.get("response_id") or uuid.uuid4().hex) + filename = f"{now.strftime('%Y%m%dT%H%M%S%fZ')}-{_safe_filename(response_id)}.json" + diagnostics = debug or {} + record = { + "timestamp": now.isoformat().replace("+00:00", "Z"), + "prompt": prompt, + "config": config.to_dict(), + "provider": response.metadata.get("provider"), + "provider_request": diagnostics.get("provider_request"), + "provider_response": diagnostics.get("provider_response"), + "adapter_transformations": diagnostics.get("adapter_transformations", []), + "parsed_content": response.content, + "latency_seconds": round(latency_seconds, 3), + "response": response.to_dict(), + } + (target_dir / filename).write_text( + json.dumps(record, indent=2, sort_keys=True), + encoding="utf-8", + ) + + +def _safe_filename(value: str) -> str: + return re.sub(r"[^A-Za-z0-9_.-]+", "-", value).strip("-") or "response" + + def main(argv=None) -> None: parser = argparse.ArgumentParser( prog="python -m llm_connect.server", diff --git a/tests/test_payload.py b/tests/test_payload.py new file mode 100644 index 0000000..4ecc934 --- /dev/null +++ b/tests/test_payload.py @@ -0,0 +1,81 @@ +from llm_connect._payload import merge_gemini_model_params, merge_openai_chat_model_params + + +STRUCTURED_SCHEMA = { + "type": "object", + "properties": { + "summary": {"type": "string"}, + "recommendations": {"type": "array", "items": {"type": "string"}}, + }, + "required": ["summary", "recommendations"], +} + + +ACTIVITY_CORE_MODEL_PARAMS = { + "reasoning_effort": "medium", + "max_depth": 4, + "json_schema": STRUCTURED_SCHEMA, + "top_p": 0.8, +} + + +def test_openai_chat_model_params_translate_activity_core_shape(): + payload = { + "model": "gpt-4.1-mini", + "messages": [{"role": "user", "content": "triage"}], + "temperature": 0.2, + "max_tokens": 200, + } + + merge_openai_chat_model_params(payload, ACTIVITY_CORE_MODEL_PARAMS) + + assert payload["response_format"] == { + "type": "json_schema", + "json_schema": { + "name": "structured_output", + "schema": STRUCTURED_SCHEMA, + "strict": False, + }, + } + assert payload["top_p"] == 0.8 + assert "reasoning_effort" not in payload + assert "max_depth" not in payload + assert "json_schema" not in payload + + +def test_openai_chat_model_params_preserve_explicit_response_format(): + explicit = { + "type": "json_schema", + "json_schema": { + "name": "custom", + "schema": STRUCTURED_SCHEMA, + "strict": True, + }, + } + payload = {"model": "gpt-4.1-mini", "messages": []} + + merge_openai_chat_model_params( + payload, + {"json_schema": STRUCTURED_SCHEMA, "response_format": explicit}, + ) + + assert payload["response_format"] == explicit + + +def test_gemini_model_params_translate_activity_core_shape(): + payload = { + "contents": [{"role": "user", "parts": [{"text": "triage"}]}], + "generationConfig": { + "temperature": 0.2, + "maxOutputTokens": 200, + }, + } + + merge_gemini_model_params(payload, ACTIVITY_CORE_MODEL_PARAMS) + + assert payload["generationConfig"]["responseMimeType"] == "application/json" + assert payload["generationConfig"]["responseSchema"] == STRUCTURED_SCHEMA + assert payload["generationConfig"]["topP"] == 0.8 + assert "reasoning_effort" not in payload + assert "max_depth" not in payload + assert "json_schema" not in payload diff --git a/tests/test_replay.py b/tests/test_replay.py new file mode 100644 index 0000000..a907f46 --- /dev/null +++ b/tests/test_replay.py @@ -0,0 +1,62 @@ +from llm_connect.replay import parse_audit_record + + +STRUCTURED_SCHEMA = { + "type": "object", + "properties": { + "summary": {"type": "string"}, + "recommendations": {"type": "array", "items": {"type": "string"}}, + }, + "required": ["summary", "recommendations"], +} + + +def test_replay_parses_openai_style_provider_response(): + record = { + "provider": "openrouter", + "config": {"model_params": {"json_schema": STRUCTURED_SCHEMA}}, + "provider_response": { + "status": 200, + "body": { + "choices": [ + { + "message": { + "content": '{"summary":"ok","recommendations":[]}' + } + } + ] + }, + }, + "parsed_content": '{"summary":"ok","recommendations":[]}', + } + + report = parse_audit_record(record) + + assert report["parsed_content"] == '{"summary":"ok","recommendations":[]}' + assert report["matches_recorded_content"] is True + assert report["structured_output"] == {"checked": True, "valid": True} + + +def test_replay_reuses_claude_code_envelope_unwrapper(): + record = { + "provider": "claude-code", + "config": {"model_params": {"json_schema": STRUCTURED_SCHEMA}}, + "provider_response": { + "status": 0, + "body": { + "stdout": ( + '{"type":"result","result":"prose",' + '"structured_result":"{\\"summary\\":\\"ok\\",' + '\\"recommendations\\":[]}"}' + ), + "stderr": "", + }, + }, + "parsed_content": '{"summary":"ok","recommendations":[]}', + } + + report = parse_audit_record(record) + + assert report["parsed_content"] == '{"summary":"ok","recommendations":[]}' + assert report["matches_recorded_content"] is True + assert report["structured_output"] == {"checked": True, "valid": True} diff --git a/tests/test_server.py b/tests/test_server.py index f417739..ac4e1a9 100644 --- a/tests/test_server.py +++ b/tests/test_server.py @@ -2,14 +2,22 @@ Tests for LLMServer HTTP serve mode (FR-1). """ +import threading +import time +from concurrent.futures import ThreadPoolExecutor import json import urllib.error import urllib.request import pytest +from llm_connect._diagnostics import ( + record_adapter_transformation, + record_provider_request, + record_provider_response, +) from llm_connect.adapter import MockLLMAdapter, ErrorLLMAdapter -from llm_connect.models import RunConfig +from llm_connect.models import LLMResponse, RunConfig from llm_connect.server import LLMServer @@ -45,6 +53,35 @@ def _post(url: str, body: dict) -> tuple[int, dict]: return exc.code, json.loads(exc.read()) +class DiagnosticLLMAdapter(MockLLMAdapter): + def execute_prompt(self, prompt: str, config: RunConfig) -> LLMResponse: + record_provider_request( + url="https://provider.example/v1/chat", + payload={"prompt": prompt, "model": config.model_name}, + headers={"Authorization": "Bearer secret-token"}, + ) + response = super().execute_prompt(prompt, config) + response.metadata["provider"] = "diagnostic" + response.metadata["response_id"] = "diag-response" + record_provider_response(status=200, body={"id": "diag-response", "content": response.content}) + record_adapter_transformation( + "diagnostic_transform", + {"before": prompt}, + {"after": response.content}, + ) + return response + + +class BarrierLLMAdapter(MockLLMAdapter): + def __init__(self): + super().__init__(mock_response="parallel") + self._barrier = threading.Barrier(2) + + def execute_prompt(self, prompt: str, config: RunConfig) -> LLMResponse: + self._barrier.wait(timeout=2.0) + return super().execute_prompt(prompt, config) + + class TestHealth: def test_health_returns_200(self, server): status, body = _get(f"http://127.0.0.1:{server.port}/health") @@ -65,6 +102,7 @@ class TestExecute: assert status == 200 assert body["content"] == "hello world" assert body["finish_reason"] == "stop" + assert "debug" not in body def test_response_includes_usage(self, server): status, body = _post( @@ -150,3 +188,86 @@ class TestExecute: ) assert status == 400 assert "config" in body["error"] + + def test_debug_query_returns_diagnostics(self): + s = LLMServer(adapter=DiagnosticLLMAdapter(mock_response="debug body"), port=0) + s.start() + try: + status, body = _post( + f"http://127.0.0.1:{s.port}/execute?debug=1", + {"prompt": "inspect", "config": {"model_name": "diagnostic-model"}}, + ) + finally: + s.stop() + + assert status == 200 + assert body["content"] == "debug body" + debug = body["debug"] + assert debug["provider_request"]["payload"] == { + "prompt": "inspect", + "model": "diagnostic-model", + } + assert debug["provider_request"]["headers_redacted"]["Authorization"] == "Bearer " + assert debug["provider_response"]["status"] == 200 + assert debug["adapter_transformations"][0]["step"] == "diagnostic_transform" + + def test_debug_env_returns_diagnostics(self, monkeypatch): + monkeypatch.setenv("LLM_CONNECT_DEBUG", "1") + s = LLMServer(adapter=DiagnosticLLMAdapter(mock_response="debug body"), port=0) + s.start() + try: + status, body = _post( + f"http://127.0.0.1:{s.port}/execute", + {"prompt": "inspect"}, + ) + finally: + s.stop() + + assert status == 200 + assert "debug" in body + + def test_audit_dir_records_replayable_call(self, monkeypatch, tmp_path): + monkeypatch.setenv("LLM_CONNECT_AUDIT_DIR", str(tmp_path)) + s = LLMServer(adapter=DiagnosticLLMAdapter(mock_response="audit body"), port=0) + s.start() + try: + status, body = _post( + f"http://127.0.0.1:{s.port}/execute", + {"prompt": "audit me", "config": {"model_name": "audit-model"}}, + ) + finally: + s.stop() + + assert status == 200 + assert "debug" not in body + files = list(tmp_path.glob("*.json")) + assert len(files) == 1 + record = json.loads(files[0].read_text(encoding="utf-8")) + assert record["prompt"] == "audit me" + assert record["config"]["model_name"] == "audit-model" + assert record["parsed_content"] == "audit body" + assert record["provider_request"]["headers_redacted"]["Authorization"] == "Bearer " + assert record["provider_response"]["body"]["id"] == "diag-response" + assert record["latency_seconds"] >= 0 + + def test_execute_requests_run_concurrently(self): + s = LLMServer(adapter=BarrierLLMAdapter(), port=0) + s.start() + try: + start = time.monotonic() + with ThreadPoolExecutor(max_workers=2) as pool: + futures = [ + pool.submit( + _post, + f"http://127.0.0.1:{s.port}/execute", + {"prompt": f"request {idx}"}, + ) + for idx in range(2) + ] + results = [future.result(timeout=3.0) for future in futures] + elapsed = time.monotonic() - start + finally: + s.stop() + + assert [status for status, _body in results] == [200, 200] + assert elapsed < 1.5 diff --git a/tests/test_structured_output_smoke.py b/tests/test_structured_output_smoke.py new file mode 100644 index 0000000..aeb0395 --- /dev/null +++ b/tests/test_structured_output_smoke.py @@ -0,0 +1,142 @@ +import json + +from llm_connect.gemini import GeminiAdapter +from llm_connect.models import RunConfig +from llm_connect.openai import OpenAIAdapter +from llm_connect.openrouter import OpenRouterAdapter + + +STRUCTURED_SCHEMA = { + "type": "object", + "properties": { + "summary": {"type": "string"}, + "recommendations": {"type": "array", "items": {"type": "string"}}, + }, + "required": ["summary", "recommendations"], +} + + +SMOKE_CONFIG = RunConfig( + model_name="gpt-4", + temperature=0.1, + max_tokens=300, + model_params={ + "reasoning_effort": "medium", + "max_depth": 3, + "json_schema": STRUCTURED_SCHEMA, + }, +) + + +def test_openrouter_structured_output_payload_and_model_routing(monkeypatch): + captured: dict[str, object] = {} + + def fake_post_json(url, payload, headers=None, timeout=300): # noqa: ANN001 + captured["url"] = url + captured["payload"] = payload + captured["headers"] = headers + captured["timeout"] = timeout + return { + "id": "or-response", + "model": payload["model"], + "choices": [ + { + "message": { + "content": json.dumps( + {"summary": "ok", "recommendations": ["keep payload clean"]} + ) + }, + "finish_reason": "stop", + } + ], + "usage": {"prompt_tokens": 1, "completion_tokens": 2, "total_tokens": 3}, + } + + monkeypatch.setattr("llm_connect.openrouter.post_json", fake_post_json) + adapter = OpenRouterAdapter( + model="anthropic/claude-sonnet-4", + api_key="or-test", + api_base="https://openrouter.example/api/v1", + ) + + response = adapter.execute_prompt("Return JSON.", SMOKE_CONFIG) + payload = captured["payload"] + + assert response.model == "anthropic/claude-sonnet-4" + assert payload["model"] == "anthropic/claude-sonnet-4" + assert payload["response_format"]["json_schema"]["schema"] == STRUCTURED_SCHEMA + assert payload["response_format"]["json_schema"]["strict"] is False + assert "reasoning_effort" not in payload + assert "max_depth" not in payload + assert "json_schema" not in payload + + +def test_openai_structured_output_payload(monkeypatch): + captured: dict[str, object] = {} + + def fake_post_json(url, payload, headers=None, timeout=300): # noqa: ANN001 + captured["payload"] = payload + return { + "id": "oa-response", + "model": payload["model"], + "choices": [ + { + "message": { + "content": json.dumps({"summary": "ok", "recommendations": []}) + }, + "finish_reason": "stop", + } + ], + "usage": {"prompt_tokens": 1, "completion_tokens": 2, "total_tokens": 3}, + } + + monkeypatch.setattr("llm_connect.openai.post_json", fake_post_json) + adapter = OpenAIAdapter(model="gpt-4.1-mini", api_key="sk-test") + + response = adapter.execute_prompt("Return JSON.", SMOKE_CONFIG) + payload = captured["payload"] + + assert response.model == "gpt-4.1-mini" + assert payload["model"] == "gpt-4.1-mini" + assert payload["response_format"]["json_schema"]["schema"] == STRUCTURED_SCHEMA + assert "reasoning_effort" not in payload + assert "max_depth" not in payload + assert "json_schema" not in payload + + +def test_gemini_structured_output_payload(monkeypatch): + captured: dict[str, object] = {} + + def fake_post_json(url, payload, headers=None, timeout=300): # noqa: ANN001 + captured["url"] = url + captured["payload"] = payload + return { + "candidates": [ + { + "content": { + "parts": [ + {"text": json.dumps({"summary": "ok", "recommendations": []})} + ] + }, + "finishReason": "STOP", + } + ], + "usageMetadata": { + "promptTokenCount": 1, + "candidatesTokenCount": 2, + "totalTokenCount": 3, + }, + } + + monkeypatch.setattr("llm_connect.gemini.post_json", fake_post_json) + adapter = GeminiAdapter(model="gemini-2.5-flash", api_key="gemini-test") + + response = adapter.execute_prompt("Return JSON.", SMOKE_CONFIG) + payload = captured["payload"] + + assert response.model == "gemini-2.5-flash" + assert payload["generationConfig"]["responseMimeType"] == "application/json" + assert payload["generationConfig"]["responseSchema"] == STRUCTURED_SCHEMA + assert "reasoning_effort" not in payload + assert "max_depth" not in payload + assert "json_schema" not in payload diff --git a/workplans/ADHOC-2026-06-02.md b/workplans/ADHOC-2026-06-02.md index 9304348..50b2918 100644 --- a/workplans/ADHOC-2026-06-02.md +++ b/workplans/ADHOC-2026-06-02.md @@ -4,11 +4,11 @@ type: workplan title: "Ad hoc — llm-connect lessons from CUST-WP-0045 canary" domain: custodian repo: llm-connect -status: ready +status: finished owner: custodian topic_slug: custodian created: "2026-06-02" -updated: "2026-06-02" +updated: "2026-06-03" state_hub_workstream_id: "1c936c91-79c7-427d-ab37-9052e8a61cda" --- @@ -38,7 +38,7 @@ workplan. ```task id: ADHOC-2026-06-02-T01 -status: todo +status: done priority: medium state_hub_task_id: "69626e9e-29f1-40f6-8cd2-d38a7e802293" ``` @@ -78,7 +78,7 @@ debug field is omitted in normal mode. ```task id: ADHOC-2026-06-02-T02 -status: todo +status: done priority: low state_hub_task_id: "e2b1be30-71f7-4497-9b10-b0f24d37beba" ``` @@ -101,7 +101,7 @@ max of their individual latencies, not the sum. ```task id: ADHOC-2026-06-02-T03 -status: todo +status: done priority: medium state_hub_task_id: "da4821f0-a876-44ce-9dc3-f3fc67732d0f" ``` @@ -127,7 +127,7 @@ ergonomics. ```task id: ADHOC-2026-06-02-T04 -status: todo +status: done priority: medium state_hub_task_id: "f8a033e6-22ac-4700-b8d2-43a5d76a3751" ``` @@ -155,7 +155,7 @@ forbidden top-level fields, schema in the right wrapper). ```task id: ADHOC-2026-06-02-T05 -status: todo +status: done priority: medium state_hub_task_id: "5d53dbb4-b374-45fe-b81c-ff0b222ca74f" ``` @@ -188,7 +188,7 @@ bug) before either was merged. ```task id: ADHOC-2026-06-02-T06 -status: todo +status: done priority: low state_hub_task_id: "33fcb951-d7ab-4d3c-8d67-9eebd986c711" ``` @@ -210,3 +210,21 @@ would only send OpenAI-valid fields. Codify the contract in Done when a new adapter author can read the doc and know what their `_merge_model_params` implementation must support. + +## Implementation Notes + +Completed on 2026-06-03: + +- Added opt-in `/execute` debug envelopes via `LLM_CONNECT_DEBUG=1` or + `?debug=1`, with redacted provider request/response capture and adapter + transformation records. +- Switched serve mode to `ThreadingHTTPServer` and added a concurrency + regression test. +- Added `LLM_CONNECT_AUDIT_DIR` per-call audit records plus + `python -m llm_connect.replay` for parser/unwrapper replay. +- Extracted shared OpenAI-compatible and Gemini payload translation helpers + and wired OpenRouter, OpenAI, and Gemini through them. +- Added CI-safe structured-output smoke tests that mock provider HTTP calls + and assert model routing plus payload shape. +- Documented the adapter `model_params` contract in + `docs/adapter-model-params.md`.