feat(infospace): systematic long-text processing — rich commit bodies, per-source eval/classify, chapters view
Some checks failed
Test Suite / unit-tests (3.11) (push) Has been cancelled
Test Suite / unit-tests (3.12) (push) Has been cancelled
Test Suite / integration-tests (push) Has been cancelled
Test Suite / e2e-tests (push) Has been cancelled
Test Suite / performance-tests (push) Has been cancelled
Test Suite / code-quality (push) Has been cancelled
Test Suite / security-scan (push) Has been cancelled
Test Suite / test-summary (push) Has been cancelled
Some checks failed
Test Suite / unit-tests (3.11) (push) Has been cancelled
Test Suite / unit-tests (3.12) (push) Has been cancelled
Test Suite / integration-tests (push) Has been cancelled
Test Suite / e2e-tests (push) Has been cancelled
Test Suite / performance-tests (push) Has been cancelled
Test Suite / code-quality (push) Has been cancelled
Test Suite / security-scan (push) Has been cancelled
Test Suite / test-summary (push) Has been cancelled
Three coordinated changes that let the pipeline produce a clean chapter-by-chapter git history on long texts without archaeology after the fact.

1. Richer commit messages. `SourcePipeline._git_commit` now diffs the staged changes, buckets added files by output subdirectory (entities, evaluations, classifications, mappings, analyses, metrics, logs), and includes counts in the commit body. So `git log` reads "entities: +23, evaluations: +23" per chapter instead of the same generic blurb on every commit. Zero behaviour change when no output changed; falls back to the original message if the diff query fails.

2. --eval-after-source / --classify-after-source on `infospace process`. After a source's stages succeed, the pipeline identifies which entity files are *new* (set diff of entity slugs before vs after), loads their EntityMeta, and runs per-entity evaluation and/or classification scoped to just those slugs before the per-source git commit lands. Result: each chapter's commit is self-contained — extraction + evaluation + classification in one atomic unit. Gated behind explicit flags because the cost is real (LLM latency per chapter rather than amortised across one bulk batch).

3. `markitect infospace chapters` subcommand. Lists source files in canonical order with entity count, evaluated count, classified count, and mean per-entity score per source. Text or JSON output. Natural triage surface for long-text infospaces — spot chapters that under-extracted or evaluated poorly.

Also: `docs/advanced-usage.md` gets a new "Systematic processing of long texts" section with the recommended flag combo and the tradeoff note on cost.

11 new unit tests cover the chapters command (text/json/no-sources), the process flag wiring (help + provider requirement), and the commit-body bucket logic. Full infospace+llm unit suite (315 tests) green; 3 pre-existing infospace failures unchanged.

Co-Authored-By: Claude Opus 4.7 <noreply@anthropic.com>
This commit is contained in:
@@ -223,3 +223,129 @@ class TestViabilityCommand:
|
||||
)
|
||||
assert result.exit_code == 0
|
||||
assert "No viability thresholds" in result.output
|
||||
|
||||
|
||||
# ── chapters (per-source triage view) ────────────────────────────────
|
||||
|
||||
|
||||
class TestChaptersCommand:
    """CLI tests for the ``chapters`` subcommand (per-source triage view)."""

    @pytest.fixture
    def chapters_dir(self, tmp_path):
        """Infospace with 2 source files and matching entities.

        Layout created under ``tmp_path``:

        * ``infospace.yaml`` pointing ``sources`` at ``artifacts/sources``
        * two chapter sources, ``book-1-chapter-01.md`` and
          ``book-1-chapter-02.md``
        * three entity files whose ``## Source Chapter`` section names
          Chapter 1 (``alpha``) or Chapter 2 (``beta``, ``gamma``)

        Returns:
            The ``tmp_path`` root that holds ``infospace.yaml``.
        """
        # The nested keys must stay indented under ``topic:``. Written
        # flush-left they parse as top-level keys and leave ``topic`` empty,
        # which disagrees with the inline fixtures used by the other tests.
        config_yaml = (
            "topic:\n"
            '  name: "WoN"\n'
            '  domain: "Economics"\n'
            "  sources: artifacts/sources\n"
        )
        (tmp_path / "infospace.yaml").write_text(config_yaml)

        sources = tmp_path / "artifacts" / "sources"
        sources.mkdir(parents=True)
        (sources / "book-1-chapter-01.md").write_text("# Chapter 1\n\nText.\n")
        (sources / "book-1-chapter-02.md").write_text("# Chapter 2\n\nText.\n")

        entities = tmp_path / "output" / "entities"
        entities.mkdir(parents=True)
        (entities / "alpha.md").write_text(
            "# Alpha\n\n## Definition\n\nX.\n\n"
            "## Source Chapter\n\nBook I, Chapter 1\n"
        )
        (entities / "beta.md").write_text(
            "# Beta\n\n## Definition\n\nY.\n\n"
            "## Source Chapter\n\nBook I, Chapter 2\n"
        )
        (entities / "gamma.md").write_text(
            "# Gamma\n\n## Definition\n\nZ.\n\n"
            "## Source Chapter\n\nBook I, Chapter 2\n"
        )
        return tmp_path

    def test_lists_sources_with_counts(self, runner, chapters_dir):
        """Text output names both sources and prints the summary totals."""
        result = runner.invoke(
            infospace_commands,
            ["chapters", "--config", str(chapters_dir / "infospace.yaml")],
        )
        assert result.exit_code == 0
        assert "book-1-chapter-01" in result.output
        assert "book-1-chapter-02" in result.output
        # ch 1 -> 1 entity, ch 2 -> 2 entities
        assert "2 source file(s); 3 entities" in result.output

    def test_json_format(self, runner, chapters_dir):
        """``--format json`` emits rows keyed by ``source_id`` with counts."""
        import json

        result = runner.invoke(
            infospace_commands,
            ["chapters", "--config", str(chapters_dir / "infospace.yaml"),
             "--format", "json"],
        )
        assert result.exit_code == 0
        rows = json.loads(result.output)
        by_id = {r["source_id"]: r for r in rows}
        assert by_id["book-1-chapter-01"]["entities"] == 1
        assert by_id["book-1-chapter-02"]["entities"] == 2

    def test_no_sources_dir(self, runner, tmp_path):
        """A config whose ``sources`` directory does not exist exits with 1."""
        (tmp_path / "infospace.yaml").write_text(
            "topic:\n name: X\n sources: missing\n"
        )
        result = runner.invoke(
            infospace_commands,
            ["chapters", "--config", str(tmp_path / "infospace.yaml")],
        )
        assert result.exit_code == 1
|
||||
|
||||
|
||||
# ── process: eval-after-source / classify-after-source flags ─────────
|
||||
|
||||
|
||||
class TestProcessAfterSourceFlags:
    """CLI wiring checks for the per-source flags on ``process``."""

    def test_flags_registered_in_help(self, runner):
        """Both after-source flags are listed in ``process --help``."""
        help_result = runner.invoke(infospace_commands, ["process", "--help"])
        assert help_result.exit_code == 0
        for flag in ("--eval-after-source", "--classify-after-source"):
            assert flag in help_result.output

    def test_flags_require_provider(self, runner, tmp_path):
        """``--eval-after-source`` without a provider exits 1 with a hint."""
        config_path = tmp_path / "infospace.yaml"
        config_path.write_text(
            "topic:\n name: X\n sources: sources\n"
            "pipeline:\n stages:\n - template: extract-entities\n"
        )

        # One source file so that ``--all`` has something to select.
        source_dir = tmp_path / "sources"
        source_dir.mkdir()
        (source_dir / "s1.md").write_text("source")

        cli_args = [
            "process",
            "--all",
            "--config",
            str(config_path),
            "--eval-after-source",
        ]
        outcome = runner.invoke(infospace_commands, cli_args)

        assert outcome.exit_code == 1
        assert "require --provider" in outcome.output
|
||||
|
||||
|
||||
# ── pipeline: commit body composition ────────────────────────────────
|
||||
|
||||
|
||||
class TestCommitBodyComposition:
    """Unit tests for the helpers that build the per-source commit body."""

    def test_bucket_for(self, tmp_path):
        """Paths under ``output/`` map to a bucket name; others map to None."""
        from markitect.infospace.config import InfospaceConfig, TopicConfig
        from markitect.infospace.pipeline import SourcePipeline

        topic = TopicConfig(name="T", domain="D")
        pipeline = SourcePipeline(InfospaceConfig(topic=topic), tmp_path)

        expected_buckets = {
            "output/entities/x.md": "entities",
            "output/evaluations/x.md": "evaluations",
            "output/classifications/x.md": "classifications",
            "output/mappings/x.md": "mappings",
            "output/notes/x.md": "other",
        }
        for rel_path, bucket in expected_buckets.items():
            assert pipeline._bucket_for(rel_path) == bucket

        # not under output/
        assert pipeline._bucket_for("README.md") is None

    def test_compose_body_uses_default_on_no_diff(self, tmp_path):
        """When git diff fails or returns empty, fall back to the default blurb."""
        from markitect.infospace.config import InfospaceConfig, TopicConfig
        from markitect.infospace.pipeline import SourcePipeline

        topic = TopicConfig(name="T", domain="D")
        # Not a git repo, so `git diff --cached` will raise CalledProcessError.
        pipeline = SourcePipeline(InfospaceConfig(topic=topic), tmp_path)

        commit_body = pipeline._compose_commit_body("some-source")
        assert "Extract entities" in commit_body
|
||||
|
||||
Reference in New Issue
Block a user