feat(infospace): flat canonical entity set with cross-chapter deduplication

Restructure entity storage from per-chapter subdirectories to a flat canonical set in output/entities/. Each entity exists as a single file; duplicates across chapters are detected by slug collision and skipped (first occurrence wins). Chapter views use {{ include }} transclusion to reference the shared entity files. Add an @{existing_entities} macro to the extract-entities template so the LLM knows which entities already exist and focuses on genuinely new ones. Extract _call_llm() from _execute_llm() for callers that handle their own file I/O. Result: 41 unique entities from 4 chapters (2 duplicates removed). Co-Authored-By: Claude Opus 4.6 <noreply@anthropic.com>
2026-02-11 22:24:20 +01:00
parent 706981c39f
commit 2d1282a61e
52 changed files with 1738 additions and 1376 deletions
--- a/examples/infospace-with-history/process_chapters.py
+++ b/examples/infospace-with-history/process_chapters.py
@@ -32,6 +32,7 @@ Usage:
 """

 import argparse
+import re
 import subprocess
 import sys
 from pathlib import Path
@@ -45,7 +46,8 @@ from markitect.prompts.models import Artifact, ArtifactType
 from markitect.prompts.repositories.sqlite import SQLiteArtifactRepository
 from markitect.prompts.dependencies.repository import SQLiteDependencyRepository
 from markitect.prompts.services.artifact_service import ArtifactService
-from markitect.prompts.templates.models import PromptTemplate, ContentMacro, MacroKind
+from markitect.prompts.templates.models import PromptTemplate
+from markitect.prompts.templates.analyzer import TemplateAnalyzer
 from markitect.prompts.resolver.resolver import PromptResolver
 from markitect.prompts.resolver.compiler import ContextCompiler
 from markitect.prompts.resolver.strategy import ResolutionConfig, MultiSpaceResolutionStrategy
@@ -80,6 +82,10 @@ class ChapterProcessor:
            self.artifact_repo, self.dep_repo, db_path=self.db_path
        )

+        # Template analysis and compilation
+        self.analyzer = TemplateAnalyzer()
+        self.compiler = ContextCompiler()
+
        # Information spaces
        self.spaces = {
            "templates": "infospace-templates",
@@ -92,9 +98,6 @@ class ChapterProcessor:
            "metrics": "infospace-metrics",
        }

-        # Content cache (repository stores metadata, we cache content)
-        self.artifact_content: dict[str, str] = {}
-
    # ── Artifact Management ──────────────────────────────────────────

    def load_or_create_artifact(
@@ -103,7 +106,7 @@ class ChapterProcessor:
        filepath: Path,
        artifact_type: ArtifactType,
        name: Optional[str] = None,
-    ) -> tuple[Artifact, str]:
+    ) -> Artifact:
        """Load artifact from file, create in repo if needed."""
        if name is None:
            name = filepath.stem
@@ -112,16 +115,14 @@ class ChapterProcessor:

        existing = self.artifact_repo.get_by_name(space, name)
        if existing:
-            self.artifact_content[existing.id] = content
-            return existing, content
+            return existing

        artifact = Artifact.create(
            space_id=space, name=name, content=content, artifact_type=artifact_type
        )
        artifact = self.artifact_repo.create(artifact)
-        self.artifact_content[artifact.id] = content
        print(f"  + {name} ({artifact.content_digest[:8]})")
-        return artifact, content
+        return artifact

    def store_output_artifact(
        self, space: str, name: str, content: str, artifact_type: ArtifactType
@@ -135,7 +136,6 @@ class ChapterProcessor:
            space_id=space, name=name, content=content, artifact_type=artifact_type
        )
        artifact = self.artifact_repo.create(artifact)
-        self.artifact_content[artifact.id] = content
        return artifact

    def bind_macro_artifact(self, space: str, macro_name: str, content: str) -> Artifact:
@@ -151,7 +151,6 @@ class ChapterProcessor:
            artifact_type=ArtifactType.CONTENT,
        )
        artifact = self.artifact_repo.create(artifact)
-        self.artifact_content[artifact.id] = content
        return artifact

    # ── Setup ────────────────────────────────────────────────────────
@@ -186,23 +185,16 @@ class ChapterProcessor:

        print("  Done.\n")

-    # ── Helpers ───────────────────────────────────────────────────────
-
-    @staticmethod
-    def _macro(target: str, kind: MacroKind = MacroKind.REQUIRED) -> ContentMacro:
-        """Create a ContentMacro with correct raw_text for @{target} syntax."""
-        return ContentMacro(kind=kind, target=target, raw_text=f"@{{{target}}}")
-
    # ── Template Resolution ──────────────────────────────────────────

    def resolve_and_compile(
-        self, template_name: str, macros: list[ContentMacro], extra_spaces: list[str]
+        self, template_name: str, extra_spaces: list[str]
    ) -> Optional[str]:
        """Resolve macros and compile a template into a final prompt string.

-        Uses the resolver for dependency validation, then performs content
-        substitution from our local cache (since the artifact repository
-        doesn't persist content — see resolver.py line 147).
+        Uses TemplateAnalyzer to parse @{target} macros from the template,
+        the resolver to look up artifact content, and ContextCompiler to
+        assemble the final prompt.
        """
        template_artifact = self.artifact_repo.get_by_name(
            self.spaces["templates"], template_name
@@ -212,8 +204,10 @@ class ChapterProcessor:
            return None

        template = PromptTemplate.from_artifact(template_artifact)
-        template.macros = macros
-        template.analyzed = True
+        template_content = template_artifact.content
+
+        # Analyze template to extract @{target} macros
+        self.analyzer.analyze(template, template_content)

        config = ResolutionConfig(
            space_id=self.spaces["templates"],
@@ -228,31 +222,16 @@ class ChapterProcessor:
            print(f"  ERROR: Resolution failed: {result.context.errors}")
            return None

-        # Load template content
-        template_content = self.artifact_content.get(template_artifact.id)
-        if not template_content:
-            template_content = (
-                self.example_dir / "templates" / f"{template_name}.md"
-            ).read_text()
+        # Compile template with resolved content
+        compiled = self.compiler.compile(template, template_content, result)
+        return compiled.content

-        # Substitute macros with actual content from cache
-        # (The resolver returns placeholders because the repo doesn't store content)
-        compiled_content = template_content
-        for resolved in result.context.resolved_macros:
-            if resolved.resolved and resolved.artifact:
-                actual_content = self.artifact_content.get(resolved.artifact.id, "")
-                compiled_content = compiled_content.replace(
-                    f"@{{{resolved.macro.target}}}", actual_content
-                )
+    # ── LLM Execution Helpers ─────────────────────────────────────────

-        return compiled_content
+    def _call_llm(self, prompt: str, stage_label: str) -> Optional[str]:
+        """Call the LLM and return the content string, or ``None`` on failure.

-    # ── LLM Execution Helper ────────────────────────────────────────
-
-    def _execute_llm(self, prompt: str, output_file: Path, stage_label: str) -> Optional[str]:
-        """Execute *prompt* via the configured LLM adapter and write the result.
-
-        Returns the generated content, or ``None`` on failure.
+        Does **not** write any files — callers decide where to persist.
        """
        import time as _time
        from markitect.prompts.execution.models import RunConfig
@@ -279,63 +258,254 @@ class ChapterProcessor:
            print(f"        LLM returned empty content")
            return None

-        output_file.parent.mkdir(parents=True, exist_ok=True)
-        output_file.write_text(content)
-        print(f"        LLM output written to {output_file.name}")
        return content

+    def _execute_llm(self, prompt: str, output_file: Path, stage_label: str) -> Optional[str]:
+        """Call the LLM, write the result to *output_file*, and return it."""
+        content = self._call_llm(prompt, stage_label)
+        if content:
+            output_file.parent.mkdir(parents=True, exist_ok=True)
+            output_file.write_text(content)
+            print(f"        LLM output written to {output_file.name}")
+        return content
+
+    # ── Entity Management (flat canonical set) ─────────────────────
+
+    @staticmethod
+    def _normalize_entity_name(name: str) -> str:
+        """Normalize an entity name to a kebab-case filename stem."""
+        slug = name.lower().strip()
+        slug = slug.replace("_", "-").replace(" ", "-")
+        slug = re.sub(r"[^a-z0-9-]", "", slug)
+        slug = re.sub(r"-{2,}", "-", slug)
+        return slug.strip("-")
+
+    def _entities_dir(self) -> Path:
+        return self.example_dir / "output" / "entities"
+
+    def _list_existing_entity_names(self) -> list[str]:
+        """Return sorted slugs of all canonical entity files already on disk."""
+        return sorted(
+            f.stem
+            for f in self._entities_dir().glob("*.md")
+            if not f.name.endswith("-entities.md")
+            and not f.name.endswith("-prompt.md")
+        )
+
+    def _split_entities(
+        self, combined_content: str
+    ) -> list[tuple[str, Path]]:
+        """Split combined LLM output into the flat canonical entity directory.
+
+        Writes each entity to ``output/entities/<slug>.md``.  If a file
+        with that slug already exists it is **skipped** (first-occurrence
+        wins), but the entity is still included in the returned list so
+        the chapter view can reference it.
+
+        Returns list of (entity_name, file_path) for every entity in
+        *combined_content* (new and pre-existing alike).
+        """
+        entities_dir = self._entities_dir()
+        entities_dir.mkdir(parents=True, exist_ok=True)
+
+        parts = re.split(
+            r"^---\s*ENTITY:\s*(.+?)\s*---\s*$",
+            combined_content,
+            flags=re.MULTILINE,
+        )
+
+        entity_files: list[tuple[str, Path]] = []
+        new_count = 0
+        skipped_count = 0
+
+        for i in range(1, len(parts), 2):
+            entity_name = parts[i]
+            entity_content = parts[i + 1].strip() if i + 1 < len(parts) else ""
+
+            slug = self._normalize_entity_name(entity_name)
+            if not slug:
+                continue
+
+            file_path = entities_dir / f"{slug}.md"
+            if file_path.exists():
+                skipped_count += 1
+            else:
+                file_path.write_text(entity_content + "\n")
+                new_count += 1
+
+            entity_files.append((entity_name, file_path))
+
+        msg = f"        {new_count} new entities written"
+        if skipped_count:
+            msg += f", {skipped_count} pre-existing (skipped)"
+        print(msg)
+        return entity_files
+
+    def _write_chapter_entity_view(
+        self, chapter_id: str, entity_files: list[tuple[str, Path]]
+    ) -> Path:
+        """Write a per-chapter view file that transcludes individual entities."""
+        parts = chapter_id.split("-")
+        book_num = int(parts[1]) if len(parts) >= 2 else 1
+        ch_num = int(parts[3]) if len(parts) >= 4 else 0
+        roman = {1: "I", 2: "II", 3: "III", 4: "IV", 5: "V"}.get(book_num, str(book_num))
+        title = f"# Economic Entities — Book {roman}, Chapter {ch_num}\n"
+
+        lines = [title]
+        for _name, file_path in entity_files:
+            lines.append(f'{{{{ include "{file_path.name}" }}}}')
+            lines.append("")
+            lines.append("---")
+            lines.append("")
+
+        # Remove trailing separator after last entity
+        if lines and lines[-1] == "" and len(lines) >= 3 and lines[-2] == "---":
+            lines = lines[:-2]
+
+        view_path = self._entities_dir() / f"{chapter_id}-entities.md"
+        view_path.write_text("\n".join(lines) + "\n")
+        print(f"        Chapter view written to {view_path.name}")
+        return view_path
+
+    def _read_entities_from_view(
+        self, chapter_id: str
+    ) -> tuple[str, list[tuple[str, Path]]]:
+        """Reconstruct combined entity content from a chapter view file.
+
+        Parses ``{{ include "..." }}`` directives in the view to discover
+        which canonical entity files belong to this chapter, reads them,
+        and rebuilds the delimited combined content needed by downstream
+        stages.
+        """
+        from markitect.packaging.transclusion.directives import DirectiveParser
+
+        view_path = self._entities_dir() / f"{chapter_id}-entities.md"
+        view_content = view_path.read_text()
+        includes = DirectiveParser.extract_file_includes(view_content)
+
+        entities_dir = self._entities_dir()
+        entity_files: list[tuple[str, Path]] = []
+        parts: list[str] = []
+
+        for rel_path in includes:
+            file_path = entities_dir / rel_path
+            if not file_path.exists():
+                continue
+            slug = file_path.stem
+            body = file_path.read_text().strip()
+            parts.append(f"--- ENTITY: {slug} ---\n\n{body}")
+            entity_files.append((slug, file_path))
+
+        combined = "\n\n".join(parts) + "\n" if parts else ""
+        return combined, entity_files
+
    # ── Pipeline Stages ──────────────────────────────────────────────

    def stage_extract_entities(self, chapter_id: str, chapter_content: str) -> Optional[str]:
-        """Stage 1: Extract economic entities from a chapter."""
+        """Stage 1: Extract economic entities from a chapter.
+
+        Canonical entity files live in a **flat** directory
+        (``output/entities/<slug>.md``).  Duplicates across chapters are
+        skipped — first occurrence wins.  The per-chapter view file
+        (``<chapter_id>-entities.md``) is a **secondary** transclusion view
+        that ``{{ include }}``s each entity relevant to the chapter.
+        """
        print(f"  [1/3] Extracting entities...")

        # Bind the chapter content to the macro name
        self.bind_macro_artifact(self.spaces["sources"], "chapter_text", chapter_content)

-        macros = [
-            self._macro("chapter_text"),
-            self._macro("extraction_rules"),
-            self._macro("vsm_framework"),
-        ]
+        # Bind existing entity list so the LLM knows what already exists
+        existing = self._list_existing_entity_names()
+        if existing:
+            entity_list = "\n".join(f"- {name}" for name in existing)
+        else:
+            entity_list = "(none — this is the first chapter)"
+        self.bind_macro_artifact(
+            self.spaces["entities"], "existing_entities", entity_list
+        )

        prompt = self.resolve_and_compile(
-            "extract-entities", macros, ["sources", "guidelines", "vsm-reference"]
+            "extract-entities",
+            ["sources", "guidelines", "vsm-reference", "entities"],
        )
        if not prompt:
            return None

-        # Write compiled prompt for inspection / LLM execution
-        prompt_file = self.example_dir / "output" / "entities" / f"{chapter_id}-prompt.md"
+        # Write compiled prompt for inspection
+        prompt_file = self._entities_dir() / f"{chapter_id}-prompt.md"
+        prompt_file.parent.mkdir(parents=True, exist_ok=True)
        prompt_file.write_text(prompt)
        print(f"        Prompt written to {prompt_file.relative_to(self.example_dir)}")

-        # Check for existing output (manual or LLM-generated)
-        output_file = self.example_dir / "output" / "entities" / f"{chapter_id}-entities.md"
-        if output_file.exists():
-            content = output_file.read_text()
+        view_file = self._entities_dir() / f"{chapter_id}-entities.md"
+
+        # ── PRIMARY: chapter view with transclusion already on disk ──
+        if view_file.exists() and "{{ include" in view_file.read_text():
+            content, entity_files = self._read_entities_from_view(chapter_id)
            self.store_output_artifact(
                self.spaces["entities"],
                f"{chapter_id}-entities",
                content,
                ArtifactType.GENERATED,
            )
-            print(f"        Found existing output: {output_file.name}")
+            print(f"        Found chapter view referencing {len(entity_files)} entities")
            return content

-        # Auto-generate via LLM if adapter is available
-        if self.llm_adapter and prompt:
-            content = self._execute_llm(prompt, output_file, "entities")
-            if content:
+        # ── MIGRATION: per-chapter subdirectory (previous format) ──
+        subdir = self._entities_dir() / chapter_id
+        if subdir.is_dir() and list(subdir.glob("*.md")):
+            print(f"        Migrating per-chapter subdir: {chapter_id}/")
+            entity_files: list[tuple[str, Path]] = []
+            entities_dir = self._entities_dir()
+            for src in sorted(subdir.glob("*.md")):
+                dest = entities_dir / src.name
+                if not dest.exists():
+                    src.rename(dest)
+                entity_files.append((src.stem, dest))
+            # Clean up empty subdir
+            if not list(subdir.glob("*")):
+                subdir.rmdir()
+            self._write_chapter_entity_view(chapter_id, entity_files)
+            content = self._read_entities_from_view(chapter_id)[0]
+            self.store_output_artifact(
+                self.spaces["entities"],
+                f"{chapter_id}-entities",
+                content,
+                ArtifactType.GENERATED,
+            )
+            return content
+
+        # ── MIGRATION: legacy combined file (pre-split format) ──
+        if view_file.exists():
+            raw = view_file.read_text()
+            if "--- ENTITY:" in raw:
+                print(f"        Migrating legacy combined file: {view_file.name}")
+                entity_files = self._split_entities(raw)
+                self._write_chapter_entity_view(chapter_id, entity_files)
                self.store_output_artifact(
                    self.spaces["entities"],
                    f"{chapter_id}-entities",
-                    content,
+                    raw,
                    ArtifactType.GENERATED,
                )
-                return content
+                return raw

-        print(f"        Awaiting output at: {output_file.relative_to(self.example_dir)}")
+        # ── GENERATE: call LLM, persist individual files first ──
+        if self.llm_adapter and prompt:
+            combined = self._call_llm(prompt, "entities")
+            if combined:
+                entity_files = self._split_entities(combined)
+                self._write_chapter_entity_view(chapter_id, entity_files)
+                self.store_output_artifact(
+                    self.spaces["entities"],
+                    f"{chapter_id}-entities",
+                    combined,
+                    ArtifactType.GENERATED,
+                )
+                return combined
+
+        print(f"        Awaiting entity files in: output/entities/")
        return None

    def stage_map_to_vsm(self, chapter_id: str, entities_content: str) -> Optional[str]:
@@ -344,14 +514,8 @@ class ChapterProcessor:

        self.bind_macro_artifact(self.spaces["entities"], "entities", entities_content)

-        macros = [
-            self._macro("entities"),
-            self._macro("vsm_framework"),
-            self._macro("mapping_rules"),
-        ]
-
        prompt = self.resolve_and_compile(
-            "map-to-vsm", macros, ["entities", "vsm-reference", "guidelines"]
+            "map-to-vsm", ["entities", "vsm-reference", "guidelines"]
        )
        if not prompt:
            return None
@@ -396,16 +560,8 @@ class ChapterProcessor:
        self.bind_macro_artifact(self.spaces["entities"], "entities", entities_content)
        self.bind_macro_artifact(self.spaces["mappings"], "mappings", mappings_content)

-        macros = [
-            self._macro("chapter_text"),
-            self._macro("entities"),
-            self._macro("mappings"),
-            self._macro("vsm_framework"),
-        ]
-
        prompt = self.resolve_and_compile(
            "synthesize-analysis",
-            macros,
            ["sources", "entities", "mappings", "vsm-reference"],
        )
        if not prompt:
@@ -462,13 +618,8 @@ class ChapterProcessor:

        self.bind_macro_artifact(self.spaces["analyses"], "all_analyses", combined)

-        macros = [
-            self._macro("all_analyses"),
-            self._macro("vsm_framework"),
-        ]
-
        prompt = self.resolve_and_compile(
-            "assess-metrics", macros, ["analyses", "vsm-reference"]
+            "assess-metrics", ["analyses", "vsm-reference"]
        )
        if not prompt:
            return None
@@ -615,11 +766,20 @@ class ChapterProcessor:
        print(f"  {'-'*30} {'-'*12} {'-'*12} {'-'*12}")

        for ch in chapters:
-            entities = "done" if (self.example_dir / "output" / "entities" / f"{ch}-entities.md").exists() else "-"
+            view_file = self._entities_dir() / f"{ch}-entities.md"
+            entity_count = 0
+            if view_file.exists() and "{{ include" in view_file.read_text():
+                from markitect.packaging.transclusion.directives import DirectiveParser
+                entity_count = len(DirectiveParser.extract_file_includes(view_file.read_text()))
+            entities = f"done ({entity_count})" if entity_count else "-"
            mappings = "done" if (self.example_dir / "output" / "mappings" / f"{ch}-mappings.md").exists() else "-"
            analysis = "done" if (self.example_dir / "output" / "analyses" / f"{ch}-analysis.md").exists() else "-"
            print(f"  {ch:<30} {entities:<12} {mappings:<12} {analysis:<12}")

+        total_entities = len(self._list_existing_entity_names())
+        if total_entities:
+            print(f"\n  Canonical entity set: {total_entities} unique entities")
+
    # ── Statistics ───────────────────────────────────────────────────

    def show_stats(self):