- PipelineStage now supports max_tokens to override the 4096 default - SourcePipeline records provider/model on each entity file as HTML comment - output/processing-log.yaml tracks tokens, cost, duration, retries, errors - _call_llm returns (content, metadata) for downstream traceability - _http.py wraps JSON parse errors with body preview for debugging - infospace.yaml stages: extract/map=6000 tokens, synthesize=3000 tokens Co-Authored-By: Claude Sonnet 4.6 <noreply@anthropic.com>
349 lines
11 KiB
Python
349 lines
11 KiB
Python
"""
|
|
Infospace configuration model and YAML loader.
|
|
|
|
An infospace is declared via an ``infospace.yaml`` file that specifies
|
|
its topic, disciplines, schemas, competency questions, and viability
|
|
thresholds. This module provides the data models and I/O for that
|
|
configuration.
|
|
|
|
Example ``infospace.yaml``::
|
|
|
|
topic:
|
|
name: "The Wealth of Nations"
|
|
domain: "Classical Economics"
|
|
sources: artifacts/sources/
|
|
|
|
disciplines:
|
|
- name: "Viable System Model"
|
|
path: artifacts/vsm-reference/
|
|
|
|
schemas:
|
|
entity: schemas/economic-entity-schema-v1.0.md
|
|
|
|
competency_questions: schemas/competency-questions.md
|
|
|
|
viability:
|
|
coverage_ratio: { min: 0.60 }
|
|
per_entity_mean: { min: 3.5 }
|
|
"""
|
|
|
|
from __future__ import annotations
|
|
|
|
from dataclasses import dataclass, field
|
|
from pathlib import Path
|
|
from typing import Any, Dict, List, Optional
|
|
|
|
import yaml
|
|
|
|
|
|
@dataclass
class TopicConfig:
    """Subject matter that an infospace sets out to explain.

    Attributes:
        name: Human-readable name of the topic.
        domain: Wider knowledge domain the topic belongs to.
        sources: Location of the source material, relative to the
            infospace root.
    """

    name: str
    domain: str = ""
    sources: str = ""

    def to_dict(self) -> Dict[str, Any]:
        """Serialize to a plain dict, leaving out empty optional fields."""
        optional = {"domain": self.domain, "sources": self.sources}
        return {
            "name": self.name,
            **{key: value for key, value in optional.items() if value},
        }

    @classmethod
    def from_dict(cls, data: Dict[str, Any]) -> TopicConfig:
        """Build a topic from a mapping; only ``name`` is required."""
        topic_name = data["name"]
        domain = data.get("domain", "")
        sources = data.get("sources", "")
        return cls(name=topic_name, domain=domain, sources=sources)
|
|
|
|
|
|
@dataclass
class DisciplineBinding:
    """External infospace that is applied as an analytical lens.

    Attributes:
        name: Human-readable name of the discipline.
        path: Location of the discipline infospace, relative to the root.
    """

    name: str
    path: str = ""

    def to_dict(self) -> Dict[str, Any]:
        """Serialize to a plain dict; ``path`` is omitted when empty."""
        if not self.path:
            return {"name": self.name}
        return {"name": self.name, "path": self.path}

    @classmethod
    def from_dict(cls, data: Dict[str, Any]) -> DisciplineBinding:
        """Build a binding from a mapping; only ``name`` is required."""
        discipline_name = data["name"]
        location = data.get("path", "")
        return cls(name=discipline_name, path=location)
|
|
|
|
|
|
@dataclass
class SchemaRegistry:
    """Registry of schema file paths for entities and documents.

    Every path is interpreted relative to the infospace root directory.
    Keys other than ``entity``, ``mapping`` and ``analysis`` are carried
    in ``extra``.
    """

    entity: str = ""
    mapping: str = ""
    analysis: str = ""
    extra: Dict[str, str] = field(default_factory=dict)

    def to_dict(self) -> Dict[str, Any]:
        """Serialize non-empty well-known paths, then merge in ``extra``."""
        result: Dict[str, Any] = {}
        for key in ("entity", "mapping", "analysis"):
            schema_path = getattr(self, key)
            if schema_path:
                result[key] = schema_path
        # Extras are applied last, exactly as an ``update`` would.
        result.update(self.extra)
        return result

    @classmethod
    def from_dict(cls, data: Dict[str, Any]) -> SchemaRegistry:
        """Build a registry, routing unrecognised keys into ``extra``."""
        well_known = ("entity", "mapping", "analysis")
        leftovers = {}
        for key, value in data.items():
            if key not in well_known:
                leftovers[key] = value
        core = {key: data.get(key, "") for key in well_known}
        return cls(extra=leftovers, **core)
|
|
|
|
|
|
@dataclass
class ViabilityThreshold:
    """Acceptable range for one viability metric.

    Either bound may be left as ``None`` to mean "unbounded on that
    side"; at least one of *min* or *max* should be set.
    """

    metric: str
    min: Optional[float] = None
    max: Optional[float] = None

    def check(self, value: float) -> bool:
        """Return ``True`` when *value* violates neither configured bound."""
        too_low = self.min is not None and value < self.min
        if too_low:
            return False
        too_high = self.max is not None and value > self.max
        return not too_high

    def to_dict(self) -> Dict[str, Any]:
        """Serialize only the bounds that are set (``metric`` is omitted)."""
        bounds = (("min", self.min), ("max", self.max))
        return {label: limit for label, limit in bounds if limit is not None}
|
|
|
|
|
|
@dataclass
class PipelineStage:
    """A single stage in the processing pipeline.

    Attributes:
        template: Path to the template file (relative to infospace root).
        name: Human-readable stage name used in progress output.
        output_dir: Directory for stage outputs (relative to root).
        output_macro: Macro name for this stage's output, also used as
            the filename suffix (e.g. ``entities`` → ``<id>-entities.md``).
        split_entities: If True, parse ``--- ENTITY: <name> ---`` delimiters
            from LLM output and write individual entity files.
        macros: Static macros loaded from files (macro name → relative path).
        spaces: Legacy space IDs for SQLite-based resolver (unused by
            :class:`SourcePipeline`).
        max_tokens: Maximum tokens to request from the LLM for this stage.
            Overrides the pipeline-level default (4096).
    """

    template: str
    name: str = ""
    output_dir: str = ""
    output_macro: str = ""
    split_entities: bool = False
    macros: Dict[str, str] = field(default_factory=dict)
    spaces: List[str] = field(default_factory=list)
    max_tokens: Optional[int] = None

    def to_dict(self) -> Dict[str, Any]:
        """Serialize the stage, omitting fields left at their defaults.

        ``macros`` and ``spaces`` are copied so that mutating the returned
        dict cannot alter this stage.
        """
        d: Dict[str, Any] = {"template": self.template}
        if self.name:
            d["name"] = self.name
        if self.output_dir:
            d["output_dir"] = self.output_dir
        if self.output_macro:
            d["output_macro"] = self.output_macro
        if self.split_entities:
            d["split_entities"] = self.split_entities
        if self.macros:
            d["macros"] = dict(self.macros)
        if self.spaces:
            d["spaces"] = list(self.spaces)
        if self.max_tokens is not None:
            d["max_tokens"] = self.max_tokens
        return d

    @classmethod
    def from_dict(cls, data: Dict[str, Any]) -> PipelineStage:
        """Build a stage from a mapping (typically one parsed YAML entry).

        Args:
            data: Stage mapping; ``template`` is required, everything
                else is optional.

        Raises:
            KeyError: If ``template`` is missing.
        """
        # A YAML key written with no value (``macros:``) parses to None, so
        # ``.get(key, default)`` would hand back None instead of the default.
        # ``or`` restores the declared field type in that case.
        return cls(
            template=data["template"],
            name=data.get("name") or "",
            output_dir=data.get("output_dir") or "",
            output_macro=data.get("output_macro") or "",
            split_entities=data.get("split_entities") or False,
            # Copy the containers so the stage does not share mutable state
            # with the mapping it was built from.
            macros=dict(data.get("macros") or {}),
            spaces=list(data.get("spaces") or []),
            max_tokens=data.get("max_tokens"),
        )
|
|
|
|
|
|
@dataclass
class PipelineConfig:
    """Processing pipeline configuration.

    Attributes:
        stages: Stage definitions held under the ``stages`` key.
        post_batch: Stage definitions held under the ``post_batch`` key.
    """

    stages: List[PipelineStage] = field(default_factory=list)
    post_batch: List[PipelineStage] = field(default_factory=list)

    def to_dict(self) -> Dict[str, Any]:
        """Serialize the pipeline, omitting stage lists that are empty."""
        d: Dict[str, Any] = {}
        if self.stages:
            d["stages"] = [s.to_dict() for s in self.stages]
        if self.post_batch:
            d["post_batch"] = [s.to_dict() for s in self.post_batch]
        return d

    @classmethod
    def from_dict(cls, data: Dict[str, Any]) -> PipelineConfig:
        """Build a pipeline config from a mapping.

        A section that is present but empty (``stages:`` with no value
        parses to ``None``) is treated the same as a missing one instead
        of failing while iterating ``None``.
        """
        return cls(
            stages=[PipelineStage.from_dict(s) for s in data.get("stages") or []],
            post_batch=[
                PipelineStage.from_dict(s) for s in data.get("post_batch") or []
            ],
        )
|
|
|
|
|
|
@dataclass
class InfospaceConfig:
    """Complete infospace configuration, loaded from ``infospace.yaml``.

    This is the declarative description of an infospace: what it
    explains, through which lenses, governed by which schemas, and
    what quality thresholds it must meet.

    Attributes:
        topic: The subject matter being explained (required).
        disciplines: Discipline bindings listed under ``disciplines``.
        schemas: Registry of schema paths.
        competency_questions: Value of the ``competency_questions`` key.
        viability: Metric name → threshold for that metric.
        pipeline: Optional processing pipeline configuration.
        entities_dir: Value of ``entities_dir``; defaults to
            ``output/entities``.
        evaluations_dir: Value of ``evaluations_dir``; defaults to
            ``output/evaluations``.
        metrics_dir: Value of ``metrics_dir``; defaults to
            ``output/metrics``.
    """

    topic: TopicConfig
    disciplines: List[DisciplineBinding] = field(default_factory=list)
    schemas: SchemaRegistry = field(default_factory=SchemaRegistry)
    competency_questions: str = ""
    viability: Dict[str, ViabilityThreshold] = field(default_factory=dict)
    pipeline: Optional[PipelineConfig] = None
    entities_dir: str = "output/entities"
    evaluations_dir: str = "output/evaluations"
    metrics_dir: str = "output/metrics"

    def to_dict(self) -> Dict[str, Any]:
        """Serialize to a plain dict suitable for YAML output.

        Empty sections and directory settings that still hold their
        default value are omitted to keep the written file minimal.
        """
        d: Dict[str, Any] = {"topic": self.topic.to_dict()}
        if self.disciplines:
            d["disciplines"] = [db.to_dict() for db in self.disciplines]
        schemas_dict = self.schemas.to_dict()
        if schemas_dict:
            d["schemas"] = schemas_dict
        if self.competency_questions:
            d["competency_questions"] = self.competency_questions
        if self.viability:
            # The metric name is the mapping key, so each threshold only
            # contributes its bounds.
            d["viability"] = {
                name: t.to_dict() for name, t in self.viability.items()
            }
        if self.pipeline:
            d["pipeline"] = self.pipeline.to_dict()
        if self.entities_dir != "output/entities":
            d["entities_dir"] = self.entities_dir
        if self.evaluations_dir != "output/evaluations":
            d["evaluations_dir"] = self.evaluations_dir
        if self.metrics_dir != "output/metrics":
            d["metrics_dir"] = self.metrics_dir
        return d

    @classmethod
    def from_dict(cls, data: Dict[str, Any]) -> InfospaceConfig:
        """Build a configuration from a parsed ``infospace.yaml`` mapping.

        Sections that are present but empty (a YAML key with no value
        parses to ``None``) are treated like missing sections, so a
        half-filled file loads with defaults instead of crashing.

        Args:
            data: Top-level mapping; ``topic`` is required.

        Raises:
            KeyError: If ``topic`` (or a required key of a nested entry)
                is missing.
            TypeError: If a viability entry carries keys other than
                ``min`` / ``max``.
        """
        # ``bounds or {}`` lets a metric be declared with no bounds yet
        # (``coverage_ratio:``) without unpacking None.
        viability = {
            name: ViabilityThreshold(metric=name, **(bounds or {}))
            for name, bounds in (data.get("viability") or {}).items()
        }
        pipeline_raw = data.get("pipeline")
        pipeline = PipelineConfig.from_dict(pipeline_raw) if pipeline_raw else None

        return cls(
            topic=TopicConfig.from_dict(data["topic"]),
            disciplines=[
                DisciplineBinding.from_dict(d)
                for d in data.get("disciplines") or []
            ],
            schemas=SchemaRegistry.from_dict(data.get("schemas") or {}),
            competency_questions=data.get("competency_questions") or "",
            viability=viability,
            pipeline=pipeline,
            entities_dir=data.get("entities_dir") or "output/entities",
            evaluations_dir=data.get("evaluations_dir") or "output/evaluations",
            metrics_dir=data.get("metrics_dir") or "output/metrics",
        )
|
|
|
|
|
|
def load_infospace_config(path: Path) -> InfospaceConfig:
    """Load an :class:`InfospaceConfig` from a YAML file.

    Args:
        path: Path to ``infospace.yaml``.

    Returns:
        The parsed configuration.

    Raises:
        FileNotFoundError: If *path* does not exist.
        ValueError: If the document is not a mapping or required fields
            are missing (``topic``, or a required key inside a nested
            entry such as ``topic.name`` or a stage's ``template``).
    """
    data = yaml.safe_load(path.read_text(encoding="utf-8"))
    if not isinstance(data, dict):
        raise ValueError(f"Expected a YAML mapping in {path}")
    if "topic" not in data:
        raise ValueError(f"Missing required 'topic' key in {path}")
    try:
        return InfospaceConfig.from_dict(data)
    except KeyError as exc:
        # Nested ``from_dict`` constructors index required keys directly;
        # surface that as the documented ValueError and name the file.
        raise ValueError(f"Missing required field {exc} in {path}") from exc
|
|
|
|
|
|
def save_infospace_config(config: InfospaceConfig, path: Path) -> None:
    """Serialize *config* as YAML and write it to *path*.

    Missing parent directories of *path* are created first. The document
    is written as UTF-8 in block style, with keys kept in the order that
    ``config.to_dict()`` produces them.
    """
    target_dir = path.parent
    target_dir.mkdir(parents=True, exist_ok=True)
    document = yaml.safe_dump(
        config.to_dict(),
        default_flow_style=False,
        sort_keys=False,
    )
    path.write_text(document, encoding="utf-8")
|
|
|
|
|
|
def find_infospace_config(start: Optional[Path] = None) -> Optional[Path]:
    """Search *start* and each of its ancestors for ``infospace.yaml``.

    Args:
        start: Where the search begins; the current working directory is
            used when this is omitted.

    Returns:
        Path of the nearest ``infospace.yaml`` that is a regular file, or
        ``None`` when no location on the way up contains one.
    """
    origin = (start if start else Path.cwd()).resolve()
    # Nearest location first, then every ancestor up to the filesystem root.
    candidates = (
        folder / "infospace.yaml" for folder in (origin, *origin.parents)
    )
    return next((found for found in candidates if found.is_file()), None)
|