feat(infospace): systematic long-text processing — rich commit bodies, per-source eval/classify, chapters view
Some checks failed
Test Suite / unit-tests (3.11) (push) Has been cancelled
Test Suite / unit-tests (3.12) (push) Has been cancelled
Test Suite / integration-tests (push) Has been cancelled
Test Suite / e2e-tests (push) Has been cancelled
Test Suite / performance-tests (push) Has been cancelled
Test Suite / code-quality (push) Has been cancelled
Test Suite / security-scan (push) Has been cancelled
Test Suite / test-summary (push) Has been cancelled

Three coordinated changes that let the pipeline produce a clean
chapter-by-chapter git history on long texts without archaeology after
the fact.

1. Richer commit messages. `SourcePipeline._git_commit` now diffs the
   staged changes, buckets added files by output subdirectory (entities,
   evaluations, classifications, mappings, analyses, metrics, logs), and
   includes counts in the commit body. So `git log` reads "entities:
   +23, evaluations: +23" per chapter instead of the same generic blurb
   on every commit. Zero behaviour change when no output changed; falls
   back to the original message if the diff query fails.

2. --eval-after-source / --classify-after-source on `infospace process`.
   After a source's stages succeed, the pipeline identifies which entity
   files are *new* (set diff of entity slugs before vs after), loads
   their EntityMeta, and runs per-entity evaluation and/or
   classification scoped to just those slugs before the per-source git
   commit lands. Result: each chapter's commit is self-contained —
   extraction + evaluation + classification in one atomic unit. Gated
   behind explicit flags because the cost is real (LLM latency per
   chapter rather than amortised across one bulk batch).

3. `markitect infospace chapters` subcommand. Lists source files in
   canonical order with entity count, evaluated count, classified
   count, and mean per-entity score per source. Text or JSON output.
   Natural triage surface for long-text infospaces — spot chapters that
   under-extracted or evaluated poorly.

Also: `docs/advanced-usage.md` gets a new "Systematic processing of
long texts" section with the recommended flag combo and the tradeoff
note on cost.

11 new unit tests cover the chapters command (text/json/no-sources),
the process flag wiring (help + provider requirement), and the
commit-body bucket logic. The full infospace+llm unit suite (315 tests)
is otherwise green; the 3 pre-existing infospace failures are unchanged.

Co-Authored-By: Claude Opus 4.7 <noreply@anthropic.com>
This commit is contained in:
2026-04-22 08:24:26 +02:00
parent 9e8d73fa7d
commit e3e5b8ecc1
4 changed files with 501 additions and 4 deletions

View File

@@ -223,3 +223,129 @@ class TestViabilityCommand:
)
assert result.exit_code == 0
assert "No viability thresholds" in result.output
# ── chapters (per-source triage view) ────────────────────────────────
class TestChaptersCommand:
    """CLI tests for the ``chapters`` subcommand (per-source triage view)."""

    @pytest.fixture
    def chapters_dir(self, tmp_path):
        """Infospace with 2 source files and matching entities.

        Creates ``infospace.yaml``, two chapter sources under
        ``artifacts/sources`` and three entity files under
        ``output/entities`` (one citing Chapter 1, two citing Chapter 2).
        Returns ``tmp_path`` so tests can locate the config file.
        """
        # NOTE(review): keys are nested under ``topic:`` to mirror the inline
        # config used by test_no_sources_dir below; the exact indentation of
        # the original literal should be confirmed against the repository.
        config_yaml = """\
topic:
  name: "WoN"
  domain: "Economics"
  sources: artifacts/sources
"""
        (tmp_path / "infospace.yaml").write_text(config_yaml)

        sources = tmp_path / "artifacts" / "sources"
        sources.mkdir(parents=True)
        for number in (1, 2):
            (sources / f"book-1-chapter-{number:02d}.md").write_text(
                f"# Chapter {number}\n\nText.\n"
            )

        entities = tmp_path / "output" / "entities"
        entities.mkdir(parents=True)
        # (file stem, heading, definition body, cited source chapter)
        entity_specs = (
            ("alpha", "Alpha", "X.", "Book I, Chapter 1"),
            ("beta", "Beta", "Y.", "Book I, Chapter 2"),
            ("gamma", "Gamma", "Z.", "Book I, Chapter 2"),
        )
        for stem, heading, definition, chapter in entity_specs:
            (entities / f"{stem}.md").write_text(
                f"# {heading}\n\n## Definition\n\n{definition}\n\n"
                f"## Source Chapter\n\n{chapter}\n"
            )
        return tmp_path

    def test_lists_sources_with_counts(self, runner, chapters_dir):
        """Text output names both sources and prints the overall summary."""
        result = runner.invoke(
            infospace_commands,
            ["chapters", "--config", str(chapters_dir / "infospace.yaml")],
        )
        assert result.exit_code == 0
        assert "book-1-chapter-01" in result.output
        assert "book-1-chapter-02" in result.output
        # Fixture: chapter 1 has 1 entity and chapter 2 has 2, so 3 in total.
        assert "2 source file(s); 3 entities" in result.output

    def test_json_format(self, runner, chapters_dir):
        """``--format json`` emits one row per source with its entity count."""
        import json

        result = runner.invoke(
            infospace_commands,
            [
                "chapters",
                "--config",
                str(chapters_dir / "infospace.yaml"),
                "--format",
                "json",
            ],
        )
        assert result.exit_code == 0
        rows = json.loads(result.output)
        by_id = {row["source_id"]: row for row in rows}
        assert by_id["book-1-chapter-01"]["entities"] == 1
        assert by_id["book-1-chapter-02"]["entities"] == 2

    def test_no_sources_dir(self, runner, tmp_path):
        """The command exits with status 1 when the sources dir is absent."""
        # ``missing`` is never created on disk.
        (tmp_path / "infospace.yaml").write_text(
            "topic:\n name: X\n sources: missing\n"
        )
        result = runner.invoke(
            infospace_commands,
            ["chapters", "--config", str(tmp_path / "infospace.yaml")],
        )
        assert result.exit_code == 1
# ── process: eval-after-source / classify-after-source flags ─────────
class TestProcessAfterSourceFlags:
    """CLI tests for the per-source eval/classify flags on ``process``."""

    def test_flags_registered_in_help(self, runner):
        """Both after-source flags are listed in ``process --help``."""
        help_result = runner.invoke(infospace_commands, ["process", "--help"])
        assert help_result.exit_code == 0
        for flag in ("--eval-after-source", "--classify-after-source"):
            assert flag in help_result.output

    def test_flags_require_provider(self, runner, tmp_path):
        """Passing ``--eval-after-source`` without a provider exits with 1."""
        config_path = tmp_path / "infospace.yaml"
        config_path.write_text(
            "topic:\n name: X\n sources: sources\n"
            "pipeline:\n stages:\n - template: extract-entities\n"
        )

        # One minimal source file so ``--all`` has something to pick up.
        source_dir = tmp_path / "sources"
        source_dir.mkdir()
        (source_dir / "s1.md").write_text("source")

        argv = [
            "process",
            "--all",
            "--config",
            str(config_path),
            "--eval-after-source",
        ]
        outcome = runner.invoke(infospace_commands, argv)

        assert outcome.exit_code == 1
        assert "require --provider" in outcome.output
# ── pipeline: commit body composition ────────────────────────────────
class TestCommitBodyComposition:
    """Unit tests for how ``SourcePipeline`` composes commit bodies."""

    @staticmethod
    def _new_pipeline(root):
        """Build a ``SourcePipeline`` rooted at *root* with a minimal config."""
        from markitect.infospace.config import InfospaceConfig, TopicConfig
        from markitect.infospace.pipeline import SourcePipeline

        minimal_cfg = InfospaceConfig(topic=TopicConfig(name="T", domain="D"))
        return SourcePipeline(minimal_cfg, root)

    def test_bucket_for(self, tmp_path):
        """Paths under ``output/`` map to a bucket; other paths map to None."""
        pipeline = self._new_pipeline(tmp_path)

        expected_buckets = {
            "output/entities/x.md": "entities",
            "output/evaluations/x.md": "evaluations",
            "output/classifications/x.md": "classifications",
            "output/mappings/x.md": "mappings",
            # A subdirectory without its own bucket is reported as "other".
            "output/notes/x.md": "other",
        }
        for path, bucket in expected_buckets.items():
            assert pipeline._bucket_for(path) == bucket

        assert pipeline._bucket_for("README.md") is None  # not under output/

    def test_compose_body_uses_default_on_no_diff(self, tmp_path):
        """When git diff fails or returns empty, fall back to the default blurb."""
        # tmp_path is not a git repo, so `git diff --cached` is expected to
        # fail (CalledProcessError) and trigger the fallback message.
        pipeline = self._new_pipeline(tmp_path)
        commit_body = pipeline._compose_commit_body("some-source")
        assert "Extract entities" in commit_body