Some checks failed
Test Suite / unit-tests (3.11) (push) Has been cancelled
Test Suite / unit-tests (3.12) (push) Has been cancelled
Test Suite / integration-tests (push) Has been cancelled
Test Suite / e2e-tests (push) Has been cancelled
Test Suite / performance-tests (push) Has been cancelled
Test Suite / code-quality (push) Has been cancelled
Test Suite / security-scan (push) Has been cancelled
Test Suite / test-summary (push) Has been cancelled
Three coordinated changes that let the pipeline produce a clean chapter-by-chapter git history on long texts without archaeology after the fact. 1. Richer commit messages. `SourcePipeline._git_commit` now diffs the staged changes, buckets added files by output subdirectory (entities, evaluations, classifications, mappings, analyses, metrics, logs), and includes counts in the commit body. So `git log` reads "entities: +23, evaluations: +23" per chapter instead of the same generic blurb on every commit. Zero behaviour change when no output changed; falls back to the original message if the diff query fails. 2. --eval-after-source / --classify-after-source on `infospace process`. After a source's stages succeed, the pipeline identifies which entity files are *new* (set diff of entity slugs before vs after), loads their EntityMeta, and runs per-entity evaluation and/or classification scoped to just those slugs before the per-source git commit lands. Result: each chapter's commit is self-contained — extraction + evaluation + classification in one atomic unit. Gated behind explicit flags because the cost is real (LLM latency per chapter rather than amortised across one bulk batch). 3. `markitect infospace chapters` subcommand. Lists source files in canonical order with entity count, evaluated count, classified count, and mean per-entity score per source. Text or JSON output. Natural triage surface for long-text infospaces — spot chapters that under-extracted or evaluated poorly. Also: `docs/advanced-usage.md` gets a new "Systematic processing of long texts" section with the recommended flag combo and the tradeoff note on cost. 11 new unit tests cover the chapters command (text/json/no-sources), the process flag wiring (help + provider requirement), and the commit-body bucket logic. Full infospace+llm unit suite (315 tests) green; 3 pre-existing infospace failures unchanged. Co-Authored-By: Claude Opus 4.7 <noreply@anthropic.com>
352 lines
13 KiB
Python
352 lines
13 KiB
Python
"""Tests for markitect.infospace.cli."""
|
|
|
|
from pathlib import Path
|
|
|
|
import pytest
|
|
from click.testing import CliRunner
|
|
|
|
from markitect.infospace.cli import infospace_commands
|
|
|
|
|
|
@pytest.fixture
def runner():
    """Provide a fresh Click ``CliRunner`` for invoking the CLI in-process."""
    cli_runner = CliRunner()
    return cli_runner
|
|
|
|
|
|
@pytest.fixture
def infospace_dir(tmp_path):
    """Create a minimal infospace directory with config and entities.

    Writes ``infospace.yaml`` plus two entity files (``alpha.md`` and
    ``beta.md``) under ``output/entities`` and returns ``tmp_path``.
    """
    # One literal per YAML line keeps the nesting explicit.
    config_text = (
        "topic:\n"
        '  name: "Test Infospace"\n'
        '  domain: "Testing"\n'
        "\n"
        "disciplines:\n"
        '  - name: "Test Discipline"\n'
        "\n"
        "viability:\n"
        "  coverage_ratio:\n"
        "    min: 0.60\n"
        "  redundancy_ratio:\n"
        "    max: 0.05\n"
    )
    (tmp_path / "infospace.yaml").write_text(config_text)

    entity_bodies = {
        "alpha.md": (
            "# Alpha\n\n## Definition\n\nAlpha is a test entity.\n\n"
            "## Source Chapter\n\nChapter 1\n\n"
            "## Domain\n\nProduction\n"
        ),
        "beta.md": (
            "# Beta\n\n## Definition\n\nBeta is another test entity with more words "
            "to make it longer.\n\n"
            "## Source Chapter\n\nChapter 2\n\n"
            "## Domain\n\nDistribution\n"
        ),
    }

    entity_root = tmp_path / "output" / "entities"
    entity_root.mkdir(parents=True)
    for filename, body in entity_bodies.items():
        (entity_root / filename).write_text(body)

    return tmp_path
|
|
|
|
|
|
# ── init ─────────────────────────────────────────────────────────────
|
|
|
|
|
|
class TestInitCommand:
    """Tests for the ``init`` subcommand, which writes a new config file."""

    def test_creates_config_file(self, runner, tmp_path):
        """A successful ``init`` creates the output file and reports it."""
        out = tmp_path / "infospace.yaml"
        result = runner.invoke(
            infospace_commands,
            ["init", "--topic", "My Topic", "--domain", "Science", "-o", str(out)],
        )
        assert result.exit_code == 0
        assert out.exists()
        assert "Created" in result.output

    def test_config_contains_topic(self, runner, tmp_path):
        """The topic passed on the command line appears in the written file."""
        out = tmp_path / "infospace.yaml"
        result = runner.invoke(
            infospace_commands,
            ["init", "--topic", "My Topic", "-o", str(out)],
        )
        # Check the command succeeded first so a failing ``init`` is reported
        # with its CLI output instead of a FileNotFoundError from read_text().
        assert result.exit_code == 0, result.output
        text = out.read_text()
        assert "My Topic" in text

    def test_refuses_overwrite(self, runner, tmp_path):
        """``init`` exits non-zero when the target file already exists."""
        out = tmp_path / "infospace.yaml"
        out.write_text("existing")
        result = runner.invoke(
            infospace_commands,
            ["init", "--topic", "X", "-o", str(out)],
        )
        assert result.exit_code != 0
        assert "already exists" in result.output

    def test_with_disciplines(self, runner, tmp_path):
        """Each repeated ``--discipline`` value ends up in the config file."""
        out = tmp_path / "infospace.yaml"
        result = runner.invoke(
            infospace_commands,
            [
                "init", "--topic", "T",
                "--discipline", "VSM",
                "--discipline", "Category Theory",
                "-o", str(out),
            ],
        )
        assert result.exit_code == 0
        text = out.read_text()
        assert "VSM" in text
        assert "Category Theory" in text
|
|
|
|
|
|
# ── status ───────────────────────────────────────────────────────────
|
|
|
|
|
|
class TestStatusCommand:
    """Tests for the ``status`` subcommand using the ``infospace_dir`` fixture."""

    def test_shows_topic_and_count(self, runner, infospace_dir):
        """Status output contains the topic name and the entity count."""
        result = runner.invoke(
            infospace_commands,
            ["status", "--config", str(infospace_dir / "infospace.yaml")],
        )
        assert result.exit_code == 0
        assert "Test Infospace" in result.output
        # NOTE(review): a bare "2" matches any "2" in the output, not only the
        # entity count; tighten once the exact count label is confirmed.
        assert "2" in result.output  # 2 entities

    def test_shows_domain_field(self, runner, infospace_dir):
        """Status output contains the domain configured under ``topic``."""
        result = runner.invoke(
            infospace_commands,
            ["status", "--config", str(infospace_dir / "infospace.yaml")],
        )
        # Assert success like the sibling test, so an error message that
        # happens to contain the expected text cannot pass.
        assert result.exit_code == 0, result.output
        # Domain from config (topic.domain), not entity domains
        assert "Testing" in result.output

    def test_shows_disciplines(self, runner, infospace_dir):
        """Status output lists the configured discipline names."""
        result = runner.invoke(
            infospace_commands,
            ["status", "--config", str(infospace_dir / "infospace.yaml")],
        )
        assert result.exit_code == 0, result.output
        assert "Test Discipline" in result.output

    def test_no_config_exits(self, runner, tmp_path):
        """Pointing ``--config`` at a missing file yields a non-zero exit."""
        result = runner.invoke(
            infospace_commands,
            ["status", "--config", str(tmp_path / "nonexistent.yaml")],
        )
        assert result.exit_code != 0
|
|
|
|
|
|
# ── entities ─────────────────────────────────────────────────────────
|
|
|
|
|
|
class TestEntitiesCommand:
    """Tests for the ``entities`` listing subcommand."""

    def test_lists_entities(self, runner, infospace_dir):
        """Both fixture entities are listed along with a total line."""
        result = runner.invoke(
            infospace_commands,
            ["entities", "--config", str(infospace_dir / "infospace.yaml")],
        )
        assert result.exit_code == 0
        assert "alpha" in result.output
        assert "beta" in result.output
        assert "Total: 2" in result.output

    def test_sort_by_domain(self, runner, infospace_dir):
        """``--sort-by domain`` lists entities in domain order."""
        result = runner.invoke(
            infospace_commands,
            [
                "entities",
                "--config", str(infospace_dir / "infospace.yaml"),
                "--sort-by", "domain",
            ],
        )
        assert result.exit_code == 0
        lines = result.output.strip().split("\n")
        data_lines = [line for line in lines if "alpha" in line or "beta" in line]
        assert len(data_lines) == 2
        # Distribution comes before Production alphabetically, so beta
        # (Distribution) must be listed ahead of alpha (Production).
        assert "beta" in data_lines[0]
        assert "alpha" in data_lines[1]

    def test_no_entities_dir(self, runner, tmp_path):
        """With no entities directory the command succeeds and says so."""
        (tmp_path / "infospace.yaml").write_text("topic:\n name: X\n")
        result = runner.invoke(
            infospace_commands,
            ["entities", "--config", str(tmp_path / "infospace.yaml")],
        )
        assert result.exit_code == 0
        assert "No entities" in result.output
|
|
|
|
|
|
# ── viability ────────────────────────────────────────────────────────
|
|
|
|
|
|
class TestViabilityCommand:
    """Tests for the ``viability`` subcommand's threshold reporting."""

    @staticmethod
    def _store_metrics(root, values):
        """Dump *values* as YAML to ``output/metrics/metrics.yaml`` under *root*."""
        import yaml

        target_dir = root / "output" / "metrics"
        target_dir.mkdir(parents=True, exist_ok=True)
        (target_dir / "metrics.yaml").write_text(yaml.safe_dump(values))

    @staticmethod
    def _run_viability(cli, config_path):
        """Invoke ``viability --config <config_path>`` and return the result."""
        return cli.invoke(
            infospace_commands,
            ["viability", "--config", str(config_path)],
        )

    def test_no_metrics_shows_thresholds(self, runner, infospace_dir):
        """Without a metrics file the configured threshold names are shown."""
        outcome = self._run_viability(runner, infospace_dir / "infospace.yaml")
        assert outcome.exit_code == 0
        assert "coverage_ratio" in outcome.output

    def test_with_metrics_file(self, runner, infospace_dir):
        """Metrics satisfying every threshold report PASS and a viable result."""
        self._store_metrics(
            infospace_dir,
            {"coverage_ratio": 0.85, "redundancy_ratio": 0.02},
        )

        outcome = self._run_viability(runner, infospace_dir / "infospace.yaml")
        assert outcome.exit_code == 0
        assert "PASS" in outcome.output
        assert "Viable: YES" in outcome.output

    def test_failing_threshold(self, runner, infospace_dir):
        """A coverage ratio below the minimum reports FAIL and not viable."""
        self._store_metrics(
            infospace_dir,
            {"coverage_ratio": 0.3, "redundancy_ratio": 0.02},
        )

        outcome = self._run_viability(runner, infospace_dir / "infospace.yaml")
        assert outcome.exit_code == 0
        assert "FAIL" in outcome.output
        assert "Viable: NO" in outcome.output

    def test_no_thresholds_configured(self, runner, tmp_path):
        """A config without a viability section yields an explanatory message."""
        config_path = tmp_path / "infospace.yaml"
        config_path.write_text("topic:\n name: X\n")
        outcome = self._run_viability(runner, config_path)
        assert outcome.exit_code == 0
        assert "No viability thresholds" in outcome.output
|
|
|
|
|
|
# ── chapters (per-source triage view) ────────────────────────────────
|
|
|
|
|
|
class TestChaptersCommand:
    """Tests for the ``chapters`` subcommand (per-source triage view)."""

    @pytest.fixture
    def chapters_dir(self, tmp_path):
        """Infospace with 2 source files and matching entities."""
        # One literal per YAML line keeps the nesting explicit.
        (tmp_path / "infospace.yaml").write_text(
            "topic:\n"
            '  name: "WoN"\n'
            '  domain: "Economics"\n'
            "  sources: artifacts/sources\n"
        )

        source_root = tmp_path / "artifacts" / "sources"
        source_root.mkdir(parents=True)
        source_files = (
            ("book-1-chapter-01.md", "# Chapter 1\n\nText.\n"),
            ("book-1-chapter-02.md", "# Chapter 2\n\nText.\n"),
        )
        for filename, body in source_files:
            (source_root / filename).write_text(body)

        entity_root = tmp_path / "output" / "entities"
        entity_root.mkdir(parents=True)
        entity_files = (
            (
                "alpha.md",
                "# Alpha\n\n## Definition\n\nX.\n\n"
                "## Source Chapter\n\nBook I, Chapter 1\n",
            ),
            (
                "beta.md",
                "# Beta\n\n## Definition\n\nY.\n\n"
                "## Source Chapter\n\nBook I, Chapter 2\n",
            ),
            (
                "gamma.md",
                "# Gamma\n\n## Definition\n\nZ.\n\n"
                "## Source Chapter\n\nBook I, Chapter 2\n",
            ),
        )
        for filename, body in entity_files:
            (entity_root / filename).write_text(body)

        return tmp_path

    def test_lists_sources_with_counts(self, runner, chapters_dir):
        """Text output names both sources and prints the summary line."""
        config_path = chapters_dir / "infospace.yaml"
        outcome = runner.invoke(
            infospace_commands,
            ["chapters", "--config", str(config_path)],
        )
        assert outcome.exit_code == 0
        for source_id in ("book-1-chapter-01", "book-1-chapter-02"):
            assert source_id in outcome.output
        # ch 1 -> 1 entity, ch 2 -> 2 entities
        assert "2 source file(s); 3 entities" in outcome.output

    def test_json_format(self, runner, chapters_dir):
        """JSON output carries a per-source ``entities`` count."""
        import json

        config_path = chapters_dir / "infospace.yaml"
        outcome = runner.invoke(
            infospace_commands,
            ["chapters", "--config", str(config_path), "--format", "json"],
        )
        assert outcome.exit_code == 0
        rows_by_source = {
            row["source_id"]: row for row in json.loads(outcome.output)
        }
        assert rows_by_source["book-1-chapter-01"]["entities"] == 1
        assert rows_by_source["book-1-chapter-02"]["entities"] == 2

    def test_no_sources_dir(self, runner, tmp_path):
        """A config whose sources directory does not exist exits with code 1."""
        config_path = tmp_path / "infospace.yaml"
        config_path.write_text(
            "topic:\n name: X\n sources: missing\n"
        )
        outcome = runner.invoke(
            infospace_commands,
            ["chapters", "--config", str(config_path)],
        )
        assert outcome.exit_code == 1
|
|
|
|
|
|
# ── process: eval-after-source / classify-after-source flags ─────────
|
|
|
|
|
|
class TestProcessAfterSourceFlags:
    """Tests for the per-source evaluation/classification flags on ``process``."""

    def test_flags_registered_in_help(self, runner):
        """Both after-source flags are listed in ``process --help``."""
        outcome = runner.invoke(infospace_commands, ["process", "--help"])
        assert outcome.exit_code == 0
        for flag in ("--eval-after-source", "--classify-after-source"):
            assert flag in outcome.output

    def test_flags_require_provider(self, runner, tmp_path):
        """``--eval-after-source`` without a provider exits 1 with a hint."""
        config_path = tmp_path / "infospace.yaml"
        config_path.write_text(
            "topic:\n name: X\n sources: sources\n"
            "pipeline:\n stages:\n - template: extract-entities\n"
        )
        source_root = tmp_path / "sources"
        source_root.mkdir()
        (source_root / "s1.md").write_text("source")

        cli_args = [
            "process", "--all",
            "--config", str(config_path),
            "--eval-after-source",
        ]
        outcome = runner.invoke(infospace_commands, cli_args)
        assert outcome.exit_code == 1
        assert "require --provider" in outcome.output
|
|
|
|
|
|
# ── pipeline: commit body composition ────────────────────────────────
|
|
|
|
|
|
class TestCommitBodyComposition:
    """Tests for how ``SourcePipeline`` buckets paths and builds commit bodies."""

    @staticmethod
    def _make_pipeline(root):
        """Build a ``SourcePipeline`` rooted at *root* with a minimal config."""
        from markitect.infospace.config import InfospaceConfig, TopicConfig
        from markitect.infospace.pipeline import SourcePipeline

        config = InfospaceConfig(topic=TopicConfig(name="T", domain="D"))
        return SourcePipeline(config, root)

    def test_bucket_for(self, tmp_path):
        """Paths under ``output/`` map to a bucket name; others map to None."""
        pipeline = self._make_pipeline(tmp_path)
        expected_buckets = (
            ("output/entities/x.md", "entities"),
            ("output/evaluations/x.md", "evaluations"),
            ("output/classifications/x.md", "classifications"),
            ("output/mappings/x.md", "mappings"),
            ("output/notes/x.md", "other"),
        )
        for path, bucket in expected_buckets:
            assert pipeline._bucket_for(path) == bucket
        assert pipeline._bucket_for("README.md") is None  # not under output/

    def test_compose_body_uses_default_on_no_diff(self, tmp_path):
        """When git diff fails or returns empty, fall back to the default blurb."""
        # Not a git repo, so `git diff --cached` will raise CalledProcessError.
        pipeline = self._make_pipeline(tmp_path)
        commit_body = pipeline._compose_commit_body("some-source")
        assert "Extract entities" in commit_body
|