feat(infospace): systematic long-text processing — rich commit bodies, per-source eval/classify, chapters view

Three coordinated changes that let the pipeline produce a clean chapter-by-chapter git history on long texts, without after-the-fact archaeology.

1. Richer commit messages. `SourcePipeline._git_commit` now diffs the staged changes, buckets added files by output subdirectory (entities, evaluations, classifications, mappings, analyses, metrics, logs), and includes counts in the commit body. So `git log` reads "entities: +23, evaluations: +23" per chapter instead of the same generic blurb on every commit. No behaviour change when no output changed; falls back to the original message if the diff query fails.

2. `--eval-after-source` / `--classify-after-source` on `infospace process`. After a source's stages succeed, the pipeline identifies which entity files are *new* (set diff of entity slugs before vs after), loads their EntityMeta, and runs per-entity evaluation and/or classification scoped to just those slugs before the per-source git commit lands. Result: each chapter's commit is self-contained — extraction + evaluation + classification in one atomic unit. Gated behind explicit flags because the cost is real (LLM latency per chapter rather than amortised across one bulk batch).

3. `markitect infospace chapters` subcommand. Lists source files in canonical order with entity count, evaluated count, classified count, and mean per-entity score per source. Text or JSON output. Natural triage surface for long-text infospaces — spot chapters that under-extracted or evaluated poorly.

Also: `docs/advanced-usage.md` gets a new "Systematic processing of long texts" section with the recommended flag combo and the tradeoff note on cost.

11 new unit tests cover the chapters command (text/json/no-sources), the process flag wiring (help + provider requirement), and the commit-body bucket logic. Full infospace+llm unit suite (315 tests) green; 3 pre-existing infospace failures unchanged.

Co-Authored-By: Claude Opus 4.7 <noreply@anthropic.com>
2026-04-22 08:24:26 +02:00
parent 9e8d73fa7d
commit e3e5b8ecc1
4 changed files with 501 additions and 4 deletions
--- a/tests/unit/infospace/test_cli.py
+++ b/tests/unit/infospace/test_cli.py
@@ -223,3 +223,129 @@ class TestViabilityCommand:
        )
        assert result.exit_code == 0
        assert "No viability thresholds" in result.output
+
+
+# ── chapters (per-source triage view) ────────────────────────────────
+
+
+class TestChaptersCommand:
+    @pytest.fixture
+    def chapters_dir(self, tmp_path):
+        """Infospace with 2 source files and matching entities."""
+        config_yaml = """\
+topic:
+  name: "WoN"
+  domain: "Economics"
+  sources: artifacts/sources
+"""
+        (tmp_path / "infospace.yaml").write_text(config_yaml)
+
+        sources = tmp_path / "artifacts" / "sources"
+        sources.mkdir(parents=True)
+        (sources / "book-1-chapter-01.md").write_text("# Chapter 1\n\nText.\n")
+        (sources / "book-1-chapter-02.md").write_text("# Chapter 2\n\nText.\n")
+
+        entities = tmp_path / "output" / "entities"
+        entities.mkdir(parents=True)
+        (entities / "alpha.md").write_text(
+            "# Alpha\n\n## Definition\n\nX.\n\n"
+            "## Source Chapter\n\nBook I, Chapter 1\n"
+        )
+        (entities / "beta.md").write_text(
+            "# Beta\n\n## Definition\n\nY.\n\n"
+            "## Source Chapter\n\nBook I, Chapter 2\n"
+        )
+        (entities / "gamma.md").write_text(
+            "# Gamma\n\n## Definition\n\nZ.\n\n"
+            "## Source Chapter\n\nBook I, Chapter 2\n"
+        )
+        return tmp_path
+
+    def test_lists_sources_with_counts(self, runner, chapters_dir):
+        result = runner.invoke(
+            infospace_commands,
+            ["chapters", "--config", str(chapters_dir / "infospace.yaml")],
+        )
+        assert result.exit_code == 0
+        assert "book-1-chapter-01" in result.output
+        assert "book-1-chapter-02" in result.output
+        # ch 1 -> 1 entity, ch 2 -> 2 entities
+        assert "2 source file(s); 3 entities" in result.output
+
+    def test_json_format(self, runner, chapters_dir):
+        result = runner.invoke(
+            infospace_commands,
+            ["chapters", "--config", str(chapters_dir / "infospace.yaml"),
+             "--format", "json"],
+        )
+        assert result.exit_code == 0
+        import json
+        rows = json.loads(result.output)
+        by_id = {r["source_id"]: r for r in rows}
+        assert by_id["book-1-chapter-01"]["entities"] == 1
+        assert by_id["book-1-chapter-02"]["entities"] == 2
+
+    def test_no_sources_dir(self, runner, tmp_path):
+        (tmp_path / "infospace.yaml").write_text(
+            "topic:\n  name: X\n  sources: missing\n"
+        )
+        result = runner.invoke(
+            infospace_commands,
+            ["chapters", "--config", str(tmp_path / "infospace.yaml")],
+        )
+        assert result.exit_code == 1
+
+
+# ── process: eval-after-source / classify-after-source flags ─────────
+
+
+class TestProcessAfterSourceFlags:
+    def test_flags_registered_in_help(self, runner):
+        result = runner.invoke(infospace_commands, ["process", "--help"])
+        assert result.exit_code == 0
+        assert "--eval-after-source" in result.output
+        assert "--classify-after-source" in result.output
+
+    def test_flags_require_provider(self, runner, tmp_path):
+        (tmp_path / "infospace.yaml").write_text(
+            "topic:\n  name: X\n  sources: sources\n"
+            "pipeline:\n  stages:\n    - template: extract-entities\n"
+        )
+        sources = tmp_path / "sources"
+        sources.mkdir()
+        (sources / "s1.md").write_text("source")
+        result = runner.invoke(
+            infospace_commands,
+            ["process", "--all",
+             "--config", str(tmp_path / "infospace.yaml"),
+             "--eval-after-source"],
+        )
+        assert result.exit_code == 1
+        assert "require --provider" in result.output
+
+
+# ── pipeline: commit body composition ────────────────────────────────
+
+
+class TestCommitBodyComposition:
+    def test_bucket_for(self, tmp_path):
+        from markitect.infospace.config import InfospaceConfig, TopicConfig
+        from markitect.infospace.pipeline import SourcePipeline
+        cfg = InfospaceConfig(topic=TopicConfig(name="T", domain="D"))
+        p = SourcePipeline(cfg, tmp_path)
+        assert p._bucket_for("output/entities/x.md") == "entities"
+        assert p._bucket_for("output/evaluations/x.md") == "evaluations"
+        assert p._bucket_for("output/classifications/x.md") == "classifications"
+        assert p._bucket_for("output/mappings/x.md") == "mappings"
+        assert p._bucket_for("output/notes/x.md") == "other"
+        assert p._bucket_for("README.md") is None  # not under output/
+
+    def test_compose_body_uses_default_on_no_diff(self, tmp_path):
+        """When git diff fails or returns empty, fall back to the default blurb."""
+        from markitect.infospace.config import InfospaceConfig, TopicConfig
+        from markitect.infospace.pipeline import SourcePipeline
+        cfg = InfospaceConfig(topic=TopicConfig(name="T", domain="D"))
+        # Not a git repo, so `git diff --cached` will raise CalledProcessError.
+        p = SourcePipeline(cfg, tmp_path)
+        body = p._compose_commit_body("some-source")
+        assert "Extract entities" in body