feat(infospace): systematic long-text processing — rich commit bodies, per-source eval/classify, chapters view

Three coordinated changes that let the pipeline produce a clean chapter-by-chapter git history on long texts without archaeology after the fact.

1. Richer commit messages. `SourcePipeline._git_commit` now diffs the staged changes, buckets added files by output subdirectory (entities, evaluations, classifications, mappings, analyses, metrics, logs), and includes counts in the commit body. So `git log` reads "entities: +23, evaluations: +23" per chapter instead of the same generic blurb on every commit. Zero behaviour change when no output changed; falls back to the original message if the diff query fails.

2. --eval-after-source / --classify-after-source on `infospace process`. After a source's stages succeed, the pipeline identifies which entity files are *new* (set diff of entity slugs before vs after), loads their EntityMeta, and runs per-entity evaluation and/or classification scoped to just those slugs before the per-source git commit lands. Result: each chapter's commit is self-contained — extraction + evaluation + classification in one atomic unit. Gated behind explicit flags because the cost is real (LLM latency per chapter rather than amortised across one bulk batch).

3. `markitect infospace chapters` subcommand. Lists source files in canonical order with entity count, evaluated count, classified count, and mean per-entity score per source. Text or JSON output. Natural triage surface for long-text infospaces — spot chapters that under-extracted or evaluated poorly.

Also: `docs/advanced-usage.md` gets a new "Systematic processing of long texts" section with the recommended flag combo and the tradeoff note on cost.

11 new unit tests cover the chapters command (text/json/no-sources), the process flag wiring (help + provider requirement), and the commit-body bucket logic. Full infospace+llm unit suite (315 tests) green; 3 pre-existing infospace failures unchanged.

Co-Authored-By: Claude Opus 4.7 <noreply@anthropic.com>
2026-04-22 08:24:26 +02:00
parent 9e8d73fa7d
commit e3e5b8ecc1
4 changed files with 501 additions and 4 deletions
--- a/markitect/infospace/pipeline.py
+++ b/markitect/infospace/pipeline.py
@@ -62,6 +62,8 @@ class SourcePipeline:
        provider: str = "",
        model: str = "",
        no_commit: bool = False,
+        eval_after_source: bool = False,
+        classify_after_source: bool = False,
    ) -> None:
        self.config = config
        self.root = root
@@ -69,6 +71,8 @@ class SourcePipeline:
        self.provider = provider
        self.model = model
        self.no_commit = no_commit
+        self.eval_after_source = eval_after_source
+        self.classify_after_source = classify_after_source

    # ── Public API ────────────────────────────────────────────────────

@@ -110,6 +114,12 @@ class SourcePipeline:
        stage_outputs: Dict[str, str] = {}
        stage_logs: List[Dict[str, Any]] = []

+        # Snapshot entity slugs before any stage runs so we can identify
+        # which entities were newly produced by this source. Used to scope
+        # --eval-after-source / --classify-after-source to only the new
+        # entities.
+        pre_entity_slugs = self._current_entity_slugs()
+
        print(f"\nProcessing: {source_id}")
        print("=" * 60)

@@ -133,6 +143,14 @@ class SourcePipeline:

        print(f"\n  {source_id}: all stages complete.")
        self._write_processing_log(source_id, stage_logs, success=True)
+
+        # Per-source follow-ups: evaluate and/or classify just the new
+        # entities this source produced, so the next commit contains a
+        # fully-processed chapter.
+        new_slugs = self._current_entity_slugs() - pre_entity_slugs
+        if new_slugs and (self.eval_after_source or self.classify_after_source):
+            self._run_per_source_followups(new_slugs)
+
        if not self.no_commit:
            self._git_commit(source_id)

@@ -636,7 +654,13 @@ class SourcePipeline:
    # ── Git Integration ───────────────────────────────────────────────

    def _git_commit(self, source_id: str) -> None:
-        """Stage all output changes and commit them for *source_id*."""
+        """Stage all output changes and commit them for *source_id*.
+
+        The commit message body summarises what actually changed — counts
+        of entities / evaluations / classifications / analyses added — so
+        ``git log`` reads like the chapter-by-chapter story of the
+        infospace growing, not a wall of identical messages.
+        """
        output_dir = self.root / "output"
        try:
            subprocess.run(
@@ -645,11 +669,11 @@ class SourcePipeline:
                check=True,
                capture_output=True,
            )
+            body = self._compose_commit_body(source_id)
            result = subprocess.run(
                [
                    "git", "commit", "-m",
-                    f"infospace: process {source_id}\n\n"
-                    f"Extract entities, map to VSM, and synthesize analysis.",
+                    f"infospace: process {source_id}\n\n{body}",
                ],
                cwd=str(self.root),
                capture_output=True,
@@ -666,3 +690,146 @@ class SourcePipeline:
        except subprocess.CalledProcessError as e:
            stderr = e.stderr.decode() if isinstance(e.stderr, bytes) else (e.stderr or "")
            print(f"  Warning: Git error: {stderr.strip()}")
+
+    # ── Per-source helpers ────────────────────────────────────────────
+
+    def _current_entity_slugs(self) -> set:
+        """Return the stems of ``*.md`` files in the entities dir, if it exists."""
+        folder = self.root / self.config.entities_dir
+        if folder.is_dir():
+            return {md_file.stem for md_file in folder.glob("*.md")}
+        return set()
+
+    def _run_per_source_followups(self, new_slugs: set) -> None:
+        """Evaluate and/or classify the entities whose slug is in *new_slugs*.
+
+        Meant to run after a source's stages succeed and before its git
+        commit, so that commit also carries these artefacts. Returns early
+        when no parsed entity matches or no adapter is set; an exception
+        raised by either step is caught and printed as a warning.
+        """
+        from markitect.infospace.entity_parser import parse_entity_directory
+
+        parsed = parse_entity_directory(self.root / self.config.entities_dir)
+        fresh = [entity for entity in parsed if entity.slug in new_slugs]
+        if not fresh:
+            return
+
+        if self.adapter is None:
+            print(
+                "  Skipping per-source eval/classify: no LLM adapter "
+                "configured (run with --provider)."
+            )
+            return
+
+        from markitect.prompts.execution.models import RunConfig
+
+        llm_settings = RunConfig(
+            model_name=self.model or None, temperature=0.3, max_tokens=2000
+        )
+
+        if self.eval_after_source:
+            from markitect.infospace.evaluate import run_entity_evaluation
+
+            print(f"  Evaluating {len(fresh)} new entity/entities…")
+            try:
+                run_entity_evaluation(
+                    config=self.config,
+                    entities=fresh,
+                    adapter=self.adapter,
+                    run_config=llm_settings,
+                    output_dir=self.root / self.config.evaluations_dir,
+                )
+            except Exception as err:
+                print(f"  Warning: per-source evaluation failed: {err}")
+
+        if self.classify_after_source:
+            from markitect.infospace.classifier import run_entity_classification
+
+            print(f"  Classifying {len(fresh)} new entity/entities…")
+            try:
+                run_entity_classification(
+                    config=self.config,
+                    entities=fresh,
+                    adapter=self.adapter,
+                    run_config=llm_settings,
+                    output_dir=self.root / self.config.classifications_dir,
+                )
+            except Exception as err:
+                print(f"  Warning: per-source classification failed: {err}")
+
+    def _compose_commit_body(self, source_id: str) -> str:
+        """Summarise staged ``output/`` changes into a commit-message body.
+
+        Produces one ``- <bucket>: +N`` line per bucket that gained files,
+        then ``- modified: N`` / ``- deleted: N`` totals. Renames are
+        requested as delete + add (``--no-renames``) so they get counted.
+
+        Args:
+            source_id: Source being committed; not used by the summary.
+
+        Returns:
+            The bullet list, or a generic default sentence when nothing
+            countable is staged or the ``git diff`` query fails.
+        """
+        default = "Extract entities, map to VSM, and synthesize analysis."
+        try:
+            result = subprocess.run(
+                ["git", "diff", "--cached", "--name-status", "--no-renames",
+                 "--", "output"],
+                cwd=str(self.root),
+                check=True,
+                capture_output=True,
+                text=True,
+            )
+        except (subprocess.CalledProcessError, OSError):
+            return default
+
+        added: Dict[str, int] = {}
+        modified = deleted = 0
+        for line in result.stdout.splitlines():
+            parts = line.split("\t")
+            if len(parts) < 2:
+                continue
+            # git C-quotes paths with unusual bytes (e.g. non-ASCII slugs);
+            # drop the quotes so the leading directories stay recognisable.
+            status, path = parts[0], parts[-1].strip('"')
+            if status.startswith("A"):
+                bucket = self._bucket_for(path)
+                if bucket:
+                    added[bucket] = added.get(bucket, 0) + 1
+            elif status.startswith("M"):
+                modified += 1
+            elif status.startswith("D"):
+                deleted += 1
+
+        # Emit buckets in a deterministic, reader-friendly order.
+        order = ["entities", "mappings", "analyses", "evaluations",
+                 "classifications", "metrics", "logs", "other"]
+        lines = [f"- {name}: +{added[name]}" for name in order if name in added]
+        if modified:
+            lines.append(f"- modified: {modified}")
+        if deleted:
+            lines.append(f"- deleted: {deleted}")
+        return "\n".join(lines) if lines else default
+
+    def _bucket_for(self, path: str) -> Optional[str]:
+        """Classify a repo-relative path into a commit-summary bucket name.
+
+        Paths whose first component is not ``output``, or that have no
+        second component, yield ``None``. The second component is looked up
+        among the basenames of the configured entities / evaluations /
+        classifications directories (so non-default layouts still bucket
+        correctly), then among a few common output subdirectory names;
+        anything else becomes ``"other"``.
+        """
+        cfg = self.config
+        dirs = (cfg.entities_dir, cfg.evaluations_dir, cfg.classifications_dir)
+        labels = ("entities", "evaluations", "classifications")
+        by_dirname = {Path(d).name: label for d, label in zip(dirs, labels)}
+        segments = Path(path).parts
+        if len(segments) < 2 or segments[0] != "output":
+            return None
+        subdir = segments[1]
+        well_known = {"mappings", "analyses", "metrics", "logs"}
+        return by_dirname.get(subdir, subdir if subdir in well_known else "other")