generated from coulomb/repo-seed
367 lines
12 KiB
Python
367 lines
12 KiB
Python
"""
Minimal HTTP server for llm_connect — serve mode (FR-1).

Exposes:
    POST /execute  — run a prompt through the configured adapter
    GET  /health   — liveness probe

Usage (programmatic)::

    from llm_connect import MockLLMAdapter
    from llm_connect.server import LLMServer

    server = LLMServer(adapter=MockLLMAdapter(), port=8080)
    server.start()   # background thread
    # ...
    server.stop()

Usage (CLI)::

    python -m llm_connect.server --port 8080 --provider openrouter --model anthropic/claude-sonnet-4
"""
|
|
|
|
import argparse
|
|
import datetime as _dt
|
|
import json
|
|
import os
|
|
import re
|
|
import threading
|
|
import time
|
|
import uuid
|
|
from http.server import BaseHTTPRequestHandler, ThreadingHTTPServer
|
|
from pathlib import Path
|
|
from typing import Optional
|
|
from urllib.parse import parse_qs, urlsplit
|
|
|
|
from llm_connect._diagnostics import capture_diagnostics
|
|
from llm_connect.adapter import LLMAdapter
|
|
from llm_connect.exceptions import (
|
|
LLMBudgetExceededError,
|
|
LLMAPIError,
|
|
LLMConfigurationError,
|
|
LLMError,
|
|
LLMRateLimitError,
|
|
LLMTimeoutError,
|
|
)
|
|
from llm_connect.models import LLMResponse, RunConfig
|
|
from llm_connect.profiles import ProfiledLLMAdapter, default_runtime_profiles
|
|
|
|
|
|
class _Handler(BaseHTTPRequestHandler):
    """Request handler — adapter injected via server.adapter.

    Routes:
        GET  /health   -> 200 ``{"status": "ok"}``
        POST /execute  -> run ``prompt`` through ``self.server.adapter``

    Any other path is answered with 404. Every response body is JSON.
    """

    def log_message(self, format, *args):  # suppress default access log
        pass

    # ── GET ────────────────────────────────────────────────────────

    def do_GET(self):
        """Answer the liveness probe; every other path is a 404."""
        if urlsplit(self.path).path == "/health":
            self._respond(200, {"status": "ok"})
        else:
            self._respond(404, {"error": "not found"})

    # ── POST ───────────────────────────────────────────────────────

    def do_POST(self):
        """Handle ``POST /execute``.

        The body must be a JSON object with a non-empty ``prompt`` and an
        optional ``config`` object. Malformed requests get a 400 with an
        ``error`` message. Failures while building the config or running the
        adapter are converted to a status/body pair by ``_error_response``.

        On success the adapter response dict is returned with status 200. A
        ``debug`` key is added when debugging was requested, and an audit
        record is written when ``LLM_CONNECT_AUDIT_DIR`` is set.
        """
        parsed = urlsplit(self.path)
        if parsed.path != "/execute":
            self._respond(404, {"error": "not found"})
            return

        data = self._read_json_object()
        if data is None:
            return  # a 400 response has already been sent

        prompt = data.get("prompt")
        if not prompt:
            self._respond(400, {"error": "missing required field: 'prompt'"})
            return

        cfg = data.get("config", {})
        if not isinstance(cfg, dict):
            self._respond(400, {"error": "field 'config' must be an object"})
            return

        debug_enabled = _debug_requested(parsed.query)
        audit_dir = os.environ.get("LLM_CONNECT_AUDIT_DIR")
        # Diagnostics feed both the debug payload and the audit record.
        diagnostics_enabled = debug_enabled or bool(audit_dir)

        try:
            # Built inside the try so a rejected config yields a JSON error
            # response instead of an unhandled exception and a dropped socket.
            config = RunConfig.from_dict(cfg)
            # Monotonic clock: latency must not be skewed by wall-clock jumps.
            start = time.monotonic()
            # NOTE(review): the extent of this ``with`` block was reconstructed
            # from a paste that lost its indentation — confirm against the
            # canonical source.
            with capture_diagnostics(diagnostics_enabled) as diagnostics:
                adapter = self.server.adapter  # type: ignore[attr-defined]
                if not adapter.validate_config(config):
                    raise LLMConfigurationError(
                        "Adapter rejected RunConfig",
                        context={"model_name": config.model_name},
                    )
                response = adapter.execute_prompt(prompt, config)
            latency = time.monotonic() - start
            body = response.to_dict()
            debug = diagnostics.to_dict() if diagnostics is not None else None
            if debug_enabled and debug is not None:
                body["debug"] = debug
            if audit_dir:
                _write_audit_record(audit_dir, prompt, config, response, debug, latency)
            self._respond(200, body)
        except Exception as exc:
            # Top-level request boundary: every failure becomes a JSON error.
            status, body = _error_response(exc)
            self._respond(status, body)

    # ── helpers ────────────────────────────────────────────────────

    def _read_json_object(self):
        """Read the request body and decode it as a JSON object.

        Returns:
            The decoded ``dict``, or ``None`` when the request was malformed;
            in that case a 400 response has already been written.
        """
        try:
            length = int(self.headers.get("Content-Length", 0))
        except (TypeError, ValueError):
            length = -1
        if length < 0:
            # A negative length would make ``read`` block until EOF.
            self._respond(400, {"error": "invalid Content-Length header"})
            return None

        raw = self.rfile.read(length)
        try:
            data = json.loads(raw)
        except ValueError:  # includes JSONDecodeError and bad UTF-8
            self._respond(400, {"error": "invalid JSON body"})
            return None

        if not isinstance(data, dict):
            # Valid JSON, but arrays/scalars have no ``prompt`` field.
            self._respond(400, {"error": "request body must be a JSON object"})
            return None
        return data

    def _respond(self, status: int, body: dict) -> None:
        """Serialize *body* as JSON and write a complete HTTP response."""
        payload = json.dumps(body).encode()
        self.send_response(status)
        self.send_header("Content-Type", "application/json")
        self.send_header("Content-Length", str(len(payload)))
        self.end_headers()
        self.wfile.write(payload)
|
|
|
|
|
|
class LLMServer:
    """HTTP server wrapping an :class:`~llm_connect.adapter.LLMAdapter`.

    Args:
        adapter: The adapter that handles ``POST /execute`` requests.
        host: Bind address (default ``"127.0.0.1"``).
        port: TCP port (default ``8080``; ``0`` picks a free port).

    The listening socket is bound in the constructor. :meth:`stop` releases
    it, so a stopped server cannot be started again — create a new instance.
    """

    def __init__(
        self,
        adapter: "LLMAdapter",
        host: str = "127.0.0.1",
        port: int = 8080,
    ) -> None:
        self._httpd = ThreadingHTTPServer((host, port), _Handler)
        # The request handler reads the adapter back from the server object.
        self._httpd.adapter = adapter  # type: ignore[attr-defined]
        self._thread: Optional[threading.Thread] = None
        # True while a serve loop is running or about to run.
        self._serving = False

    @property
    def port(self) -> int:
        """Actual bound port (useful when ``port=0`` was requested)."""
        return self._httpd.server_address[1]

    @property
    def host(self) -> str:
        """Address the listening socket is bound to."""
        return self._httpd.server_address[0]

    def start(self) -> None:
        """Start serving in a daemon background thread.

        Does nothing when a serve loop is already active.
        """
        if self._serving:
            return
        # Set before the thread runs so an immediate stop() still signals it.
        self._serving = True
        self._thread = threading.Thread(target=self._serve, daemon=True)
        self._thread.start()

    def stop(self) -> None:
        """Shut down the server, join the background thread, close the socket.

        Safe to call when the server was never started and safe to call more
        than once.
        """
        if self._serving:
            # shutdown() blocks forever unless a serve loop is (or will be)
            # running, hence the guard.
            self._httpd.shutdown()
        thread, self._thread = self._thread, None
        if thread is not None:
            thread.join()
        self._httpd.server_close()

    def serve_forever(self) -> None:
        """Block the calling thread until interrupted."""
        self._serve()

    def _serve(self) -> None:
        """Run the serve loop, keeping ``_serving`` in sync with it."""
        self._serving = True
        try:
            self._httpd.serve_forever()
        finally:
            self._serving = False
|
|
|
|
|
|
# ── CLI entry point ────────────────────────────────────────────────────────────
|
|
|
|
def _build_adapter(
    provider: str,
    model: Optional[str],
    *,
    enable_profiles: bool = True,
    strict_profiles: bool = False,
) -> LLMAdapter:
    """Create the adapter the CLI server will serve.

    Args:
        provider: Provider name passed to ``create_adapter``.
        model: Optional model name passed to ``create_adapter``.
        enable_profiles: When false, return the bare adapter.
        strict_profiles: Forwarded to ``ProfiledLLMAdapter``.

    Returns:
        The bare adapter, or that adapter wrapped in ``ProfiledLLMAdapter``
        with the default runtime profiles for *provider* / *model*.
    """
    # Imported here rather than at module import time.
    from llm_connect.factory import create_adapter

    adapter = create_adapter(provider, model=model)
    if not enable_profiles:
        return adapter
    return ProfiledLLMAdapter(
        adapter,
        default_runtime_profiles(provider=provider, model=model),
        strict_profiles=strict_profiles,
    )
|
|
|
|
|
|
def _debug_requested(query: str) -> bool:
    """Return True when debug output is enabled for this request.

    Debugging is on when the ``LLM_CONNECT_DEBUG`` environment variable is
    truthy, or when any ``debug`` parameter in *query* (the raw URL query
    string) is truthy.
    """
    if _truthy(os.environ.get("LLM_CONNECT_DEBUG", "")):
        return True
    return any(_truthy(value) for value in parse_qs(query).get("debug", []))
|
|
|
|
|
|
def _truthy(value: str) -> bool:
    """Return True when *value* is ``1``, ``true``, ``yes`` or ``on``.

    Surrounding whitespace and letter case are ignored.
    """
    return value.strip().lower() in {"1", "true", "yes", "on"}
|
|
|
|
|
|
def _error_response(exc: Exception) -> tuple[int, dict]:
    """Map exceptions to operator-useful, secret-safe server responses.

    Returns:
        ``(http_status, body)`` where *body* comes from ``_error_body``.

    Mapping, first match wins:
        LLMRateLimitError       -> 429 ``provider_rate_limited`` (+ provider_status)
        LLMTimeoutError         -> 504 ``provider_timeout``
        LLMAPIError             -> 502 ``provider_api_error`` (+ provider_status if set)
        LLMBudgetExceededError  -> 400 ``budget_exceeded``
        LLMConfigurationError   -> 400 ``unknown_profile`` when the message starts
                                   with "Unknown LLM runtime profile",
                                   otherwise 500 ``configuration_error``
        LLMError                -> 500 ``llm_error``
        anything else           -> 500 ``internal_error``
    """
    # NOTE(review): the specific provider errors are tested before LLMAPIError
    # and LLMError, presumably because they subclass them — keep this order.
    if isinstance(exc, LLMRateLimitError):
        body = _error_body("provider_rate_limited", exc)
        body["provider_status"] = exc.status_code
        return 429, body
    if isinstance(exc, LLMTimeoutError):
        return 504, _error_body("provider_timeout", exc)
    if isinstance(exc, LLMAPIError):
        body = _error_body("provider_api_error", exc)
        if exc.status_code:
            body["provider_status"] = exc.status_code
        return 502, body
    if isinstance(exc, LLMBudgetExceededError):
        return 400, _error_body("budget_exceeded", exc)
    if isinstance(exc, LLMConfigurationError):
        if _message(exc).startswith("Unknown LLM runtime profile"):
            return 400, _error_body("unknown_profile", exc)
        return 500, _error_body("configuration_error", exc)
    if isinstance(exc, LLMError):
        return 500, _error_body("llm_error", exc)
    return 500, _error_body("internal_error", exc)
|
|
|
|
|
|
def _error_body(code: str, exc: Exception) -> dict:
    """Build the JSON error payload for *exc*.

    The payload always carries ``error`` (the given *code*), ``message`` (the
    sanitized exception message) and ``type`` (the exception class name). When
    the exception has a dict ``context`` attribute and its filtered form is
    non-empty, that is added under ``context``.
    """
    body = {
        "error": code,
        "message": _sanitize_text(_message(exc)),
        "type": exc.__class__.__name__,
    }
    context = getattr(exc, "context", None)
    if isinstance(context, dict):
        safe_context = _safe_context(context)
        if safe_context:
            body["context"] = safe_context
    return body
|
|
|
|
|
|
def _message(exc: Exception) -> str:
    """Return the exception's first argument as text, else ``str(exc)``."""
    if exc.args:
        return str(exc.args[0])
    return str(exc)
|
|
|
|
|
|
def _safe_context(context: dict) -> dict:
    """Return a copy of *context* that is safe to send to a client.

    * A value whose key contains ``key``, ``secret``, ``token`` or
      ``password`` (case-insensitive) is replaced by ``"<redacted>"``.
    * ``None`` and numeric/boolean values are passed through unchanged.
    * Everything else is converted with ``str()`` and run through
      ``_sanitize_text``.
    """
    secret_words = ("key", "secret", "token", "password")
    safe = {}
    for key, value in context.items():
        lowered = str(key).lower()
        if any(word in lowered for word in secret_words):
            safe[key] = "<redacted>"
        elif value is None or isinstance(value, (int, float, bool)):
            safe[key] = value
        else:
            # Strings and any other object are scrubbed as text.
            safe[key] = _sanitize_text(str(value))
    return safe
|
|
|
|
|
|
def _sanitize_text(value: str) -> str:
    """Redact credential-looking substrings from *value*.

    Four substitutions are applied in order:

    1. ``Bearer <token>``                 -> ``Bearer <redacted>``
    2. ``?key=...`` / ``&key=...``        -> value replaced by ``<redacted>``
    3. ``sk-`` followed by 8+ id chars    -> ``sk-<redacted>``
    4. ``api_key=``, ``token=``, ``secret=``, ``password=`` assignments
       (case-insensitive)                 -> value replaced by ``<redacted>``
    """
    value = re.sub(r"Bearer\s+[A-Za-z0-9._~+/=-]+", "Bearer <redacted>", value)
    value = re.sub(r"([?&]key=)[^&\s]+", r"\1<redacted>", value)
    value = re.sub(r"\bsk-[A-Za-z0-9_-]{8,}", "sk-<redacted>", value)
    value = re.sub(
        r"(?i)(api[_-]?key|token|secret|password)=([^,\s\]]+)",
        r"\1=<redacted>",
        value,
    )
    return value
|
|
|
|
|
|
def _write_audit_record(
    audit_dir: str,
    prompt: str,
    config: RunConfig,
    response: LLMResponse,
    debug: dict | None,
    latency_seconds: float,
) -> None:
    """Write one JSON audit file describing a completed request.

    Args:
        audit_dir: Directory for audit files; created if missing.
        prompt: The prompt that was executed.
        config: The run configuration used (stored via ``to_dict()``).
        response: The adapter response (stored via ``to_dict()``).
        debug: Captured diagnostics dict, or ``None`` when there are none.
        latency_seconds: Request duration; stored rounded to milliseconds.

    The file name is the current UTC timestamp (microsecond resolution)
    followed by the filename-safe response id. The id is taken from
    ``response.metadata["response_id"]``, or is a random UUID hex when that
    is missing or empty.
    """
    target_dir = Path(audit_dir)
    target_dir.mkdir(parents=True, exist_ok=True)

    now = _dt.datetime.now(_dt.timezone.utc)
    response_id = str(response.metadata.get("response_id") or uuid.uuid4().hex)
    filename = f"{now.strftime('%Y%m%dT%H%M%S%fZ')}-{_safe_filename(response_id)}.json"
    diagnostics = debug or {}
    record = {
        "timestamp": now.isoformat().replace("+00:00", "Z"),
        "prompt": prompt,
        "config": config.to_dict(),
        "provider": response.metadata.get("provider"),
        "provider_request": diagnostics.get("provider_request"),
        "provider_response": diagnostics.get("provider_response"),
        "adapter_transformations": diagnostics.get("adapter_transformations", []),
        "parsed_content": response.content,
        "latency_seconds": round(latency_seconds, 3),
        "response": response.to_dict(),
    }
    (target_dir / filename).write_text(
        json.dumps(record, indent=2, sort_keys=True),
        encoding="utf-8",
    )
|
|
|
|
|
|
def _safe_filename(value: str) -> str:
    """Reduce *value* to characters that are safe in a file name.

    Each run of characters other than ASCII letters, digits, ``_``, ``.`` and
    ``-`` becomes a single ``-``. Leading and trailing ``-`` are stripped. If
    nothing is left, ``"response"`` is returned.
    """
    return re.sub(r"[^A-Za-z0-9_.-]+", "-", value).strip("-") or "response"
|
|
|
|
|
|
def main(argv=None) -> None:
    """CLI entry point: parse options, build the adapter, serve until Ctrl-C.

    Args:
        argv: Argument list to parse; ``None`` lets argparse read
            ``sys.argv``.

    Option defaults come from the ``LLM_CONNECT_PORT``, ``LLM_CONNECT_HOST``,
    ``LLM_CONNECT_PROVIDER``, ``LLM_CONNECT_MODEL`` and
    ``LLM_CONNECT_STRICT_PROFILES`` environment variables.
    """
    parser = argparse.ArgumentParser(
        prog="python -m llm_connect.server",
        description="Start llm_connect HTTP serve mode.",
    )
    parser.add_argument(
        "--port",
        type=int,
        default=int(os.environ.get("LLM_CONNECT_PORT", "8080")),
        help="TCP port (default: env LLM_CONNECT_PORT or 8080)",
    )
    parser.add_argument(
        "--host",
        default=os.environ.get("LLM_CONNECT_HOST", "127.0.0.1"),
        help="Bind address (default: env LLM_CONNECT_HOST or 127.0.0.1)",
    )
    parser.add_argument(
        "--provider",
        default=os.environ.get("LLM_CONNECT_PROVIDER", "mock"),
        help="Provider name passed to create_adapter (default: env LLM_CONNECT_PROVIDER or mock)",
    )
    parser.add_argument(
        "--model",
        # An empty environment value is treated the same as "not set".
        default=os.environ.get("LLM_CONNECT_MODEL") or None,
        help="Model name (default: env LLM_CONNECT_MODEL, optional)",
    )
    parser.add_argument(
        "--disable-profiles",
        action="store_true",
        help="Disable server runtime profile dispatch.",
    )
    parser.add_argument(
        "--strict-profiles",
        action="store_true",
        default=_truthy(os.environ.get("LLM_CONNECT_STRICT_PROFILES", "")),
        help="Reject non-profile model_name values instead of passing them through.",
    )
    args = parser.parse_args(argv)

    adapter = _build_adapter(
        args.provider,
        args.model,
        enable_profiles=not args.disable_profiles,
        strict_profiles=args.strict_profiles,
    )
    server = LLMServer(adapter=adapter, host=args.host, port=args.port)
    # Print the port that was actually bound: with --port 0 the OS chooses
    # one, and echoing args.port would advertise an unusable ":0" URL.
    print(f"llm_connect server listening on http://{args.host}:{server.port}")
    try:
        server.serve_forever()
    except KeyboardInterrupt:
        print("\nShutting down.")
|
|
|
|
|
|
# Run the CLI only when executed as a script or via ``python -m``.
if __name__ == "__main__":
    main()
|