feat(infospace): add per-entity evaluation pipeline and CLI command (S2.3)
The evaluation pipeline builds prompts from entity metadata, delegates to BatchEvaluator, parses structured LLM responses into ScoreEntry objects, and writes evaluation files. CLI: 'markitect infospace evaluate' with --provider and --model options and --entity / --chapter filters.
Co-Authored-By: Claude Opus 4.6 <noreply@anthropic.com>
This commit is contained in:
@@ -153,6 +153,71 @@ def entities(config_path: Optional[str], sort_key: str):
|
|||||||
click.echo(f"\nTotal: {len(entity_list)} entities")
|
click.echo(f"\nTotal: {len(entity_list)} entities")
|
||||||
|
|
||||||
|
|
||||||
|
# ── evaluate ─────────────────────────────────────────────────────────
|
||||||
|
|
||||||
|
|
||||||
|
@infospace_commands.command()
@click.option("--config", "config_path", default=None, help="Path to infospace.yaml.")
@click.option("--provider", default="openrouter", help="LLM provider (openrouter, openai, etc.).")
@click.option("--model", default=None, help="LLM model name.")
@click.option("--entity", "entity_slug", default=None, help="Evaluate a single entity by slug.")
@click.option("--chapter", default=None, help="Evaluate entities from a specific chapter.")
def evaluate(config_path, provider, model, entity_slug, chapter):
    """Evaluate entities using LLM-based quality assessment."""
    # NOTE: the docstring above is what click shows as the command help text,
    # so it is kept short; implementation notes live in comments instead.
    cfg, cfg_path = _load_config_or_exit(config_path)
    # All configured directories are resolved relative to the config file.
    root = cfg_path.parent

    entities_dir = root / cfg.entities_dir
    if not entities_dir.is_dir():
        click.echo("Error: No entities directory found.", err=True)
        raise SystemExit(1)

    entity_list = parse_entity_directory(entities_dir)
    if not entity_list:
        click.echo("No entities to evaluate.")
        return

    # Filter. --entity takes precedence: when both --entity and --chapter are
    # given, only the slug filter is applied.
    if entity_slug:
        entity_list = [e for e in entity_list if e.slug == entity_slug]
        if not entity_list:
            click.echo(f"Error: Entity '{entity_slug}' not found.", err=True)
            raise SystemExit(1)
    elif chapter:
        # Substring match. An entity whose source chapter is unset must simply
        # not match, instead of raising TypeError on ``chapter in None``.
        entity_list = [
            e for e in entity_list if chapter in (e.source_chapter or "")
        ]
        if not entity_list:
            click.echo(f"No entities found for chapter '{chapter}'.")
            return

    # Create adapter (imported here, inside the command, as in the original)
    from markitect.llm import create_adapter
    from markitect.prompts.execution.models import RunConfig

    adapter = create_adapter(provider, model=model)
    run_config = RunConfig(model_name=model or "default", temperature=0.3, max_tokens=2000)

    # Progress callback: one status line per processed entity.
    def on_progress(done, total, result):
        status = result.status.upper()
        click.echo(f" [{done}/{total}] {result.key}: {status}")

    click.echo(f"Evaluating {len(entity_list)} entities via {provider}...")

    from markitect.infospace.evaluate import run_entity_evaluation

    output_dir = root / cfg.evaluations_dir
    summary = run_entity_evaluation(
        config=cfg,
        entities=entity_list,
        adapter=adapter,
        run_config=run_config,
        output_dir=output_dir,
        progress_callback=on_progress,
    )

    click.echo(f"\nDone: {summary.succeeded} succeeded, {summary.failed} failed, {summary.skipped} skipped")
    if summary.total_tokens > 0:
        click.echo(f"Tokens used: {summary.total_tokens}")
|
||||||
|
|
||||||
|
|
||||||
# ── viability ────────────────────────────────────────────────────────
|
# ── viability ────────────────────────────────────────────────────────
|
||||||
|
|
||||||
|
|
||||||
|
|||||||
215
markitect/infospace/evaluate.py
Normal file
215
markitect/infospace/evaluate.py
Normal file
@@ -0,0 +1,215 @@
|
|||||||
|
"""
|
||||||
|
Per-entity evaluation pipeline.
|
||||||
|
|
||||||
|
Builds prompts from entity metadata and delegates LLM evaluation to
|
||||||
|
the :class:`BatchEvaluator`. Writes structured results to the
|
||||||
|
evaluations directory.
|
||||||
|
"""
|
||||||
|
|
||||||
|
from __future__ import annotations
|
||||||
|
|
||||||
|
import hashlib
|
||||||
|
from datetime import datetime
|
||||||
|
from pathlib import Path
|
||||||
|
from typing import Callable, Dict, List, Optional
|
||||||
|
|
||||||
|
from markitect.infospace.config import InfospaceConfig
|
||||||
|
from markitect.infospace.evaluation import EntityEvaluation, ScoreEntry
|
||||||
|
from markitect.infospace.evaluation_io import write_entity_evaluation
|
||||||
|
from markitect.infospace.models import EntityMeta
|
||||||
|
from markitect.prompts.execution.batch import BatchEvaluator, BatchItem, BatchSummary
|
||||||
|
from markitect.prompts.execution.llm_adapter import LLMAdapter
|
||||||
|
from markitect.prompts.execution.models import RunConfig
|
||||||
|
|
||||||
|
|
||||||
|
_DEFAULT_DIMENSIONS = [
|
||||||
|
"definition_precision",
|
||||||
|
"source_grounding",
|
||||||
|
"domain_relevance",
|
||||||
|
"discipline_alignment",
|
||||||
|
"conceptual_clarity",
|
||||||
|
]
|
||||||
|
|
||||||
|
_PROMPT_TEMPLATE = """\
|
||||||
|
You are evaluating an entity from an infospace about "{topic}".
|
||||||
|
|
||||||
|
## Entity: {title}
|
||||||
|
|
||||||
|
**Slug:** {slug}
|
||||||
|
**Domain:** {domain}
|
||||||
|
**Source chapter:** {source_chapter}
|
||||||
|
|
||||||
|
### Definition
|
||||||
|
{definition}
|
||||||
|
|
||||||
|
### Context
|
||||||
|
{context}
|
||||||
|
|
||||||
|
## Instructions
|
||||||
|
|
||||||
|
Rate this entity on each dimension below using a scale of 1-5 \
|
||||||
|
(1 = poor, 5 = excellent). For each dimension, provide:
|
||||||
|
1. A numeric score (1-5)
|
||||||
|
2. A brief rationale (1-2 sentences)
|
||||||
|
|
||||||
|
### Dimensions to evaluate:
|
||||||
|
{dimensions_list}
|
||||||
|
|
||||||
|
## Output format
|
||||||
|
|
||||||
|
Return your evaluation as a structured list:
|
||||||
|
|
||||||
|
DIMENSION: <name>
|
||||||
|
SCORE: <1-5>
|
||||||
|
RATIONALE: <explanation>
|
||||||
|
|
||||||
|
Repeat for each dimension.
|
||||||
|
"""
|
||||||
|
|
||||||
|
|
||||||
|
def build_evaluation_prompt(
    entity: EntityMeta,
    topic: str,
    dimensions: Optional[List[str]] = None,
) -> str:
    """Render the LLM evaluation prompt for one entity.

    Args:
        entity: Entity whose title, slug, domain, source chapter,
            definition and context are substituted into the template.
        topic: Name of the infospace topic shown in the prompt header.
        dimensions: Dimension names to list in the prompt. ``None`` or an
            empty list selects ``_DEFAULT_DIMENSIONS``.

    Returns:
        ``_PROMPT_TEMPLATE`` with every placeholder filled in. Falsy entity
        fields are replaced by a parenthesised placeholder text.
    """
    chosen = dimensions if dimensions else _DEFAULT_DIMENSIONS
    bullet_lines = [f"- {name}" for name in chosen]

    substitutions = {
        "topic": topic,
        "title": entity.title,
        "slug": entity.slug,
        "domain": entity.domain or "(unspecified)",
        "source_chapter": entity.source_chapter or "(unspecified)",
        "definition": entity.definition or "(no definition)",
        "context": entity.context or "(no context)",
        "dimensions_list": "\n".join(bullet_lines),
    }
    return _PROMPT_TEMPLATE.format(**substitutions)
|
||||||
|
|
||||||
|
|
||||||
|
def content_digest(entity: EntityMeta) -> str:
    """Return a short fingerprint of the entity's evaluated content.

    The slug, definition, context and domain are joined with ``:`` and
    hashed with SHA-256; the first 16 hex characters are returned. The
    value is used to detect unchanged entities for incremental evaluation.
    """
    fingerprint_source = "{}:{}:{}:{}".format(
        entity.slug,
        entity.definition,
        entity.context,
        entity.domain,
    )
    hasher = hashlib.sha256()
    hasher.update(fingerprint_source.encode())
    return hasher.hexdigest()[:16]
|
||||||
|
|
||||||
|
|
||||||
|
def parse_evaluation_response(
    response_text: str,
    dimensions: Optional[List[str]] = None,
) -> List[ScoreEntry]:
    """Parse structured dimension scores from LLM response text.

    Expects blocks of::

        DIMENSION: <name>
        SCORE: <1-5>
        RATIONALE: <text>

    Labels are matched case-insensitively at the start of a line. Lines that
    follow a scored dimension and carry no label are appended to its
    rationale. A dimension is emitted only if a usable score was found for
    it; dimensions without one are dropped silently.

    Args:
        response_text: Raw text returned by the LLM.
        dimensions: Accepted for interface compatibility; the parser does not
            currently use it and returns whatever dimensions the response
            names.

    Returns:
        One :class:`ScoreEntry` per scored dimension, in response order, each
        with ``max_value=5.0``.
    """
    import math  # function-scope import keeps this block self-contained

    def _parse_score(raw: str) -> Optional[float]:
        # Models often answer "4/5" or "4 out of 5"; only the leading number
        # is the score. Anything that is not a finite number is rejected
        # (``float`` would otherwise happily accept "nan" and "inf").
        tokens = raw.split("/", 1)[0].split()
        if not tokens:
            return None
        try:
            score = float(tokens[0])
        except ValueError:
            return None
        return score if math.isfinite(score) else None

    scores: List[ScoreEntry] = []

    def _flush(dim: Optional[str], score: Optional[float], rationale: str) -> None:
        # Emit the pending entry, if it is complete enough to be meaningful.
        if dim is not None and score is not None:
            scores.append(ScoreEntry(
                name=dim,
                value=score,
                max_value=5.0,
                rationale=rationale.strip(),
            ))

    current_dim: Optional[str] = None
    current_score: Optional[float] = None
    current_rationale = ""

    for line in response_text.splitlines():
        stripped = line.strip()
        label = stripped.upper()
        if label.startswith("DIMENSION:"):
            # A new dimension closes the previous one.
            _flush(current_dim, current_score, current_rationale)
            current_dim = stripped.split(":", 1)[1].strip()
            current_score = None
            current_rationale = ""
        elif label.startswith("SCORE:"):
            current_score = _parse_score(stripped.split(":", 1)[1])
        elif label.startswith("RATIONALE:"):
            current_rationale = stripped.split(":", 1)[1].strip()
        elif current_dim is not None and current_score is not None:
            # Continuation of rationale
            if stripped:
                current_rationale += " " + stripped

    # Flush last
    _flush(current_dim, current_score, current_rationale)

    return scores
|
||||||
|
|
||||||
|
|
||||||
|
def run_entity_evaluation(
    config: InfospaceConfig,
    entities: List[EntityMeta],
    adapter: LLMAdapter,
    run_config: Optional[RunConfig] = None,
    output_dir: Optional[Path] = None,
    previous_digests: Optional[Dict[str, str]] = None,
    progress_callback: Optional[Callable] = None,
    dimensions: Optional[List[str]] = None,
) -> BatchSummary:
    """Run per-entity evaluation using the batch evaluator.

    Builds one :class:`BatchItem` per entity (keyed by slug), runs them
    through a :class:`BatchEvaluator`, and writes one ``<slug>.md``
    evaluation file for every result whose status is ``"success"``.

    Args:
        config: The infospace configuration.
        entities: Entities to evaluate.
        adapter: LLM adapter for evaluation.
        run_config: LLM execution configuration. Its ``model_name`` is
            recorded as the evaluator; ``"unknown"`` is used when omitted.
        output_dir: Where to write evaluation results. Defaults to
            ``config.evaluations_dir`` relative to CWD.
        previous_digests: ``{slug: digest}`` for incremental skip.
        progress_callback: Called after each item.
        dimensions: Custom evaluation dimensions.

    Returns:
        A :class:`BatchSummary` with per-entity results.
    """
    from datetime import timezone  # only needed for the timestamp below

    topic = config.topic.name
    items = [
        BatchItem(
            key=entity.slug,
            prompt=build_evaluation_prompt(entity, topic, dimensions),
            content_digest=content_digest(entity),
            metadata={"source_path": entity.source_path},
        )
        for entity in entities
    ]

    evaluator = BatchEvaluator(
        adapter=adapter,
        config=run_config,
        progress_callback=progress_callback,
        previous_digests=previous_digests,
    )
    summary = evaluator.evaluate(items)

    # Write successful results
    evaluations_path = output_dir or Path(config.evaluations_dir)
    evaluator_name = run_config.model_name if run_config else "unknown"

    for result in summary.results:
        if result.status != "success" or result.response is None:
            continue

        # NOTE: a response that cannot be parsed yields an empty score list;
        # the evaluation file is still written in that case.
        scores = parse_evaluation_response(result.response.content, dimensions)
        evaluation = EntityEvaluation(
            entity_slug=result.key,
            evaluator=evaluator_name,
            scores=scores,
            # ``datetime.utcnow()`` is deprecated since Python 3.12. This is
            # the documented replacement; tzinfo is dropped so the stored
            # value stays a naive UTC timestamp exactly as before.
            evaluated_at=datetime.now(timezone.utc).replace(tzinfo=None),
        )
        eval_path = evaluations_path / f"{result.key}.md"
        write_entity_evaluation(evaluation, eval_path)

    return summary
|
||||||
224
tests/unit/infospace/test_evaluate.py
Normal file
224
tests/unit/infospace/test_evaluate.py
Normal file
@@ -0,0 +1,224 @@
|
|||||||
|
"""Tests for markitect.infospace.evaluate."""
|
||||||
|
|
||||||
|
from datetime import datetime
|
||||||
|
from pathlib import Path
|
||||||
|
|
||||||
|
import pytest
|
||||||
|
|
||||||
|
from markitect.infospace.config import InfospaceConfig, TopicConfig
|
||||||
|
from markitect.infospace.evaluate import (
|
||||||
|
build_evaluation_prompt,
|
||||||
|
content_digest,
|
||||||
|
parse_evaluation_response,
|
||||||
|
run_entity_evaluation,
|
||||||
|
)
|
||||||
|
from markitect.infospace.evaluation import ScoreEntry
|
||||||
|
from markitect.infospace.models import EntityMeta
|
||||||
|
from markitect.prompts.execution.llm_adapter import MockLLMAdapter
|
||||||
|
from markitect.prompts.execution.models import RunConfig
|
||||||
|
|
||||||
|
|
||||||
|
# ── Helpers ──────────────────────────────────────────────────────────
|
||||||
|
|
||||||
|
|
||||||
|
def _entity(**overrides) -> EntityMeta:
    """Build a sample ``EntityMeta``; keyword arguments replace the defaults."""
    fields = {
        "slug": "division-of-labour",
        "title": "Division Of Labour",
        "h1_raw": "Division Of Labour",
        "definition": "Splitting work into specialised tasks.",
        "source_chapter": "Book I Chapter 1",
        "context": "Smith introduces the concept early.",
        "domain": "Production",
        "source_path": "entities/division-of-labour.md",
    }
    for key, value in overrides.items():
        fields[key] = value
    return EntityMeta(**fields)
|
||||||
|
|
||||||
|
|
||||||
|
def _config() -> InfospaceConfig:
    """Return a minimal infospace configuration with only the topic set."""
    topic = TopicConfig(name="The Wealth of Nations")
    return InfospaceConfig(topic=topic)
|
||||||
|
|
||||||
|
|
||||||
|
_MOCK_RESPONSE = """\
|
||||||
|
DIMENSION: definition_precision
|
||||||
|
SCORE: 4.5
|
||||||
|
RATIONALE: Clear and specific definition of the concept.
|
||||||
|
|
||||||
|
DIMENSION: source_grounding
|
||||||
|
SCORE: 4.0
|
||||||
|
RATIONALE: Well grounded in Smith's text.
|
||||||
|
|
||||||
|
DIMENSION: domain_relevance
|
||||||
|
SCORE: 5.0
|
||||||
|
RATIONALE: Directly relevant to production economics.
|
||||||
|
"""
|
||||||
|
|
||||||
|
|
||||||
|
# ── build_evaluation_prompt ──────────────────────────────────────────
|
||||||
|
|
||||||
|
|
||||||
|
class TestBuildPrompt:
    """Prompt rendering: entity fields, topic, dimensions and placeholders."""

    def test_contains_entity_fields(self):
        rendered = build_evaluation_prompt(_entity(), "Test Topic")
        for fragment in (
            "division-of-labour",
            "Division Of Labour",
            "Production",
            "Splitting work",
        ):
            assert fragment in rendered

    def test_contains_topic(self):
        rendered = build_evaluation_prompt(_entity(), "WoN")
        assert "WoN" in rendered

    def test_contains_dimensions(self):
        rendered = build_evaluation_prompt(_entity(), "T")
        for dimension in ("definition_precision", "source_grounding"):
            assert dimension in rendered

    def test_custom_dimensions(self):
        custom = ["novelty", "coherence"]
        rendered = build_evaluation_prompt(_entity(), "T", dimensions=custom)
        assert "novelty" in rendered
        assert "coherence" in rendered
        # Defaults must not leak in when custom dimensions are supplied.
        assert "definition_precision" not in rendered

    def test_handles_missing_fields(self):
        blank = _entity(definition="", context="", domain="")
        rendered = build_evaluation_prompt(blank, "T")
        for placeholder in ("(no definition)", "(no context)", "(unspecified)"):
            assert placeholder in rendered
|
||||||
|
|
||||||
|
|
||||||
|
# ── content_digest ───────────────────────────────────────────────────
|
||||||
|
|
||||||
|
|
||||||
|
class TestContentDigest:
    """The digest is stable for equal content and differs when content does."""

    def test_deterministic(self):
        sample = _entity()
        first = content_digest(sample)
        second = content_digest(sample)
        assert first == second

    def test_changes_with_content(self):
        digest_a = content_digest(_entity(definition="A"))
        digest_b = content_digest(_entity(definition="B"))
        assert digest_a != digest_b
|
||||||
|
|
||||||
|
|
||||||
|
# ── parse_evaluation_response ────────────────────────────────────────
|
||||||
|
|
||||||
|
|
||||||
|
class TestParseResponse:
    """Parsing of DIMENSION / SCORE / RATIONALE blocks from a canned reply."""

    def test_parses_three_dimensions(self):
        parsed = parse_evaluation_response(_MOCK_RESPONSE)
        assert len(parsed) == 3

    def test_correct_names(self):
        found = [entry.name for entry in parse_evaluation_response(_MOCK_RESPONSE)]
        for expected in ("definition_precision", "source_grounding", "domain_relevance"):
            assert expected in found

    def test_correct_scores(self):
        values = {
            entry.name: entry.value
            for entry in parse_evaluation_response(_MOCK_RESPONSE)
        }
        assert values["definition_precision"] == 4.5
        assert values["source_grounding"] == 4.0
        assert values["domain_relevance"] == 5.0

    def test_correct_rationales(self):
        rationales = {
            entry.name: entry.rationale
            for entry in parse_evaluation_response(_MOCK_RESPONSE)
        }
        assert "Clear" in rationales["definition_precision"]

    def test_empty_response(self):
        assert parse_evaluation_response("") == []

    def test_malformed_score_skipped(self):
        # A dimension whose score is not numeric must not produce an entry.
        reply = "DIMENSION: x\nSCORE: not-a-number\nRATIONALE: oops"
        parsed = parse_evaluation_response(reply)
        assert len(parsed) == 0
|
||||||
|
|
||||||
|
|
||||||
|
# ── run_entity_evaluation ────────────────────────────────────────────
|
||||||
|
|
||||||
|
|
||||||
|
class TestRunEntityEvaluation:
    """End-to-end runs of the pipeline against a mock LLM adapter."""

    def test_evaluates_entities(self, tmp_path):
        mock_llm = MockLLMAdapter(_MOCK_RESPONSE)
        batch = [_entity(), _entity(slug="pin-factory", title="Pin Factory")]

        summary = run_entity_evaluation(
            config=_config(),
            entities=batch,
            adapter=mock_llm,
            output_dir=tmp_path / "evals",
        )

        assert summary.total == 2
        assert summary.succeeded == 2
        assert mock_llm.call_count == 2

    def test_writes_evaluation_files(self, tmp_path):
        target_dir = tmp_path / "evals"

        run_entity_evaluation(
            config=_config(),
            entities=[_entity()],
            adapter=MockLLMAdapter(_MOCK_RESPONSE),
            output_dir=target_dir,
        )

        written = target_dir / "division-of-labour.md"
        assert written.exists()
        assert "definition_precision" in written.read_text()

    def test_incremental_skip(self, tmp_path):
        mock_llm = MockLLMAdapter(_MOCK_RESPONSE)
        sample = _entity()
        # Supplying the entity's current digest marks it as already evaluated.
        known = {sample.slug: content_digest(sample)}

        summary = run_entity_evaluation(
            config=_config(),
            entities=[sample],
            adapter=mock_llm,
            output_dir=tmp_path,
            previous_digests=known,
        )

        assert summary.skipped == 1
        assert mock_llm.call_count == 0

    def test_progress_callback_called(self, tmp_path):
        seen = []

        def record(done, total, result):
            seen.append((done, total, result.key))

        run_entity_evaluation(
            config=_config(),
            entities=[_entity()],
            adapter=MockLLMAdapter(_MOCK_RESPONSE),
            output_dir=tmp_path,
            progress_callback=record,
        )

        assert len(seen) == 1
        assert seen[0] == (1, 1, "division-of-labour")

    def test_passes_run_config(self, tmp_path):
        mock_llm = MockLLMAdapter(_MOCK_RESPONSE)
        settings = RunConfig(temperature=0.1, max_tokens=500)

        run_entity_evaluation(
            config=_config(),
            entities=[_entity()],
            adapter=mock_llm,
            run_config=settings,
            output_dir=tmp_path,
        )

        assert mock_llm.last_config.temperature == 0.1
|
||||||
Reference in New Issue
Block a user