feat(infospace): flat canonical entity set with cross-chapter deduplication

Restructure entity storage from per-chapter subdirectories to a flat
canonical set in output/entities/. Each entity exists as a single file;
duplicates across chapters are detected by slug collision and skipped
(first occurrence wins). Chapter views use {{ include }} transclusion
to reference shared entity files.

Add @{existing_entities} macro to extract-entities template so the LLM
knows which entities already exist and focuses on genuinely new ones.
Extract _call_llm() from _execute_llm() for callers that handle their
own file I/O. Result: 41 unique entities from 4 chapters (2 duplicates removed).

Co-Authored-By: Claude Opus 4.6 <noreply@anthropic.com>
This commit is contained in:
2026-02-11 22:24:20 +01:00
parent 706981c39f
commit 2d1282a61e
52 changed files with 1738 additions and 1376 deletions

View File

@@ -32,6 +32,7 @@ Usage:
"""
import argparse
import re
import subprocess
import sys
from pathlib import Path
@@ -45,7 +46,8 @@ from markitect.prompts.models import Artifact, ArtifactType
from markitect.prompts.repositories.sqlite import SQLiteArtifactRepository
from markitect.prompts.dependencies.repository import SQLiteDependencyRepository
from markitect.prompts.services.artifact_service import ArtifactService
from markitect.prompts.templates.models import PromptTemplate, ContentMacro, MacroKind
from markitect.prompts.templates.models import PromptTemplate
from markitect.prompts.templates.analyzer import TemplateAnalyzer
from markitect.prompts.resolver.resolver import PromptResolver
from markitect.prompts.resolver.compiler import ContextCompiler
from markitect.prompts.resolver.strategy import ResolutionConfig, MultiSpaceResolutionStrategy
@@ -80,6 +82,10 @@ class ChapterProcessor:
self.artifact_repo, self.dep_repo, db_path=self.db_path
)
# Template analysis and compilation
self.analyzer = TemplateAnalyzer()
self.compiler = ContextCompiler()
# Information spaces
self.spaces = {
"templates": "infospace-templates",
@@ -92,9 +98,6 @@ class ChapterProcessor:
"metrics": "infospace-metrics",
}
# Content cache (repository stores metadata, we cache content)
self.artifact_content: dict[str, str] = {}
# ── Artifact Management ──────────────────────────────────────────
def load_or_create_artifact(
@@ -103,7 +106,7 @@ class ChapterProcessor:
filepath: Path,
artifact_type: ArtifactType,
name: Optional[str] = None,
) -> tuple[Artifact, str]:
) -> Artifact:
"""Load artifact from file, create in repo if needed."""
if name is None:
name = filepath.stem
@@ -112,16 +115,14 @@ class ChapterProcessor:
existing = self.artifact_repo.get_by_name(space, name)
if existing:
self.artifact_content[existing.id] = content
return existing, content
return existing
artifact = Artifact.create(
space_id=space, name=name, content=content, artifact_type=artifact_type
)
artifact = self.artifact_repo.create(artifact)
self.artifact_content[artifact.id] = content
print(f" + {name} ({artifact.content_digest[:8]})")
return artifact, content
return artifact
def store_output_artifact(
self, space: str, name: str, content: str, artifact_type: ArtifactType
@@ -135,7 +136,6 @@ class ChapterProcessor:
space_id=space, name=name, content=content, artifact_type=artifact_type
)
artifact = self.artifact_repo.create(artifact)
self.artifact_content[artifact.id] = content
return artifact
def bind_macro_artifact(self, space: str, macro_name: str, content: str) -> Artifact:
@@ -151,7 +151,6 @@ class ChapterProcessor:
artifact_type=ArtifactType.CONTENT,
)
artifact = self.artifact_repo.create(artifact)
self.artifact_content[artifact.id] = content
return artifact
# ── Setup ────────────────────────────────────────────────────────
@@ -186,23 +185,16 @@ class ChapterProcessor:
print(" Done.\n")
# ── Helpers ───────────────────────────────────────────────────────
@staticmethod
def _macro(target: str, kind: MacroKind = MacroKind.REQUIRED) -> ContentMacro:
"""Create a ContentMacro with correct raw_text for @{target} syntax."""
return ContentMacro(kind=kind, target=target, raw_text=f"@{{{target}}}")
# ── Template Resolution ──────────────────────────────────────────
def resolve_and_compile(
self, template_name: str, macros: list[ContentMacro], extra_spaces: list[str]
self, template_name: str, extra_spaces: list[str]
) -> Optional[str]:
"""Resolve macros and compile a template into a final prompt string.
Uses the resolver for dependency validation, then performs content
substitution from our local cache (since the artifact repository
doesn't persist content — see resolver.py line 147).
Uses TemplateAnalyzer to parse @{target} macros from the template,
the resolver to look up artifact content, and ContextCompiler to
assemble the final prompt.
"""
template_artifact = self.artifact_repo.get_by_name(
self.spaces["templates"], template_name
@@ -212,8 +204,10 @@ class ChapterProcessor:
return None
template = PromptTemplate.from_artifact(template_artifact)
template.macros = macros
template.analyzed = True
template_content = template_artifact.content
# Analyze template to extract @{target} macros
self.analyzer.analyze(template, template_content)
config = ResolutionConfig(
space_id=self.spaces["templates"],
@@ -228,31 +222,16 @@ class ChapterProcessor:
print(f" ERROR: Resolution failed: {result.context.errors}")
return None
# Load template content
template_content = self.artifact_content.get(template_artifact.id)
if not template_content:
template_content = (
self.example_dir / "templates" / f"{template_name}.md"
).read_text()
# Compile template with resolved content
compiled = self.compiler.compile(template, template_content, result)
return compiled.content
# Substitute macros with actual content from cache
# (The resolver returns placeholders because the repo doesn't store content)
compiled_content = template_content
for resolved in result.context.resolved_macros:
if resolved.resolved and resolved.artifact:
actual_content = self.artifact_content.get(resolved.artifact.id, "")
compiled_content = compiled_content.replace(
f"@{{{resolved.macro.target}}}", actual_content
)
# ── LLM Execution Helpers ─────────────────────────────────────────
return compiled_content
def _call_llm(self, prompt: str, stage_label: str) -> Optional[str]:
"""Call the LLM and return the content string, or ``None`` on failure.
# ── LLM Execution Helper ────────────────────────────────────────
def _execute_llm(self, prompt: str, output_file: Path, stage_label: str) -> Optional[str]:
"""Execute *prompt* via the configured LLM adapter and write the result.
Returns the generated content, or ``None`` on failure.
Does **not** write any files — callers decide where to persist.
"""
import time as _time
from markitect.prompts.execution.models import RunConfig
@@ -279,63 +258,254 @@ class ChapterProcessor:
print(f" LLM returned empty content")
return None
output_file.parent.mkdir(parents=True, exist_ok=True)
output_file.write_text(content)
print(f" LLM output written to {output_file.name}")
return content
def _execute_llm(self, prompt: str, output_file: Path, stage_label: str) -> Optional[str]:
"""Call the LLM, write the result to *output_file*, and return it."""
content = self._call_llm(prompt, stage_label)
if content:
output_file.parent.mkdir(parents=True, exist_ok=True)
output_file.write_text(content)
print(f" LLM output written to {output_file.name}")
return content
# ── Entity Management (flat canonical set) ─────────────────────
@staticmethod
def _normalize_entity_name(name: str) -> str:
"""Normalize an entity name to a kebab-case filename stem."""
slug = name.lower().strip()
slug = slug.replace("_", "-").replace(" ", "-")
slug = re.sub(r"[^a-z0-9-]", "", slug)
slug = re.sub(r"-{2,}", "-", slug)
return slug.strip("-")
def _entities_dir(self) -> Path:
return self.example_dir / "output" / "entities"
def _list_existing_entity_names(self) -> list[str]:
"""Return sorted slugs of all canonical entity files already on disk."""
return sorted(
f.stem
for f in self._entities_dir().glob("*.md")
if not f.name.endswith("-entities.md")
and not f.name.endswith("-prompt.md")
)
def _split_entities(
self, combined_content: str
) -> list[tuple[str, Path]]:
"""Split combined LLM output into the flat canonical entity directory.
Writes each entity to ``output/entities/<slug>.md``. If a file
with that slug already exists it is **skipped** (first-occurrence
wins), but the entity is still included in the returned list so
the chapter view can reference it.
Returns list of (entity_name, file_path) for every entity in
*combined_content* (new and pre-existing alike).
"""
entities_dir = self._entities_dir()
entities_dir.mkdir(parents=True, exist_ok=True)
parts = re.split(
r"^---\s*ENTITY:\s*(.+?)\s*---\s*$",
combined_content,
flags=re.MULTILINE,
)
entity_files: list[tuple[str, Path]] = []
new_count = 0
skipped_count = 0
for i in range(1, len(parts), 2):
entity_name = parts[i]
entity_content = parts[i + 1].strip() if i + 1 < len(parts) else ""
slug = self._normalize_entity_name(entity_name)
if not slug:
continue
file_path = entities_dir / f"{slug}.md"
if file_path.exists():
skipped_count += 1
else:
file_path.write_text(entity_content + "\n")
new_count += 1
entity_files.append((entity_name, file_path))
msg = f" {new_count} new entities written"
if skipped_count:
msg += f", {skipped_count} pre-existing (skipped)"
print(msg)
return entity_files
def _write_chapter_entity_view(
self, chapter_id: str, entity_files: list[tuple[str, Path]]
) -> Path:
"""Write a per-chapter view file that transcludes individual entities."""
parts = chapter_id.split("-")
book_num = int(parts[1]) if len(parts) >= 2 else 1
ch_num = int(parts[3]) if len(parts) >= 4 else 0
roman = {1: "I", 2: "II", 3: "III", 4: "IV", 5: "V"}.get(book_num, str(book_num))
title = f"# Economic Entities — Book {roman}, Chapter {ch_num}\n"
lines = [title]
for _name, file_path in entity_files:
lines.append(f'{{{{ include "{file_path.name}" }}}}')
lines.append("")
lines.append("---")
lines.append("")
# Remove trailing separator after last entity
if lines and lines[-1] == "" and len(lines) >= 3 and lines[-2] == "---":
lines = lines[:-2]
view_path = self._entities_dir() / f"{chapter_id}-entities.md"
view_path.write_text("\n".join(lines) + "\n")
print(f" Chapter view written to {view_path.name}")
return view_path
def _read_entities_from_view(
    self, chapter_id: str
) -> tuple[str, list[tuple[str, Path]]]:
    """Reconstruct combined entity content from a chapter view file.

    Reads ``<entities_dir>/<chapter_id>-entities.md``, asks
    ``DirectiveParser.extract_file_includes`` for the files it includes
    (presumably the ``{{ include "..." }}`` targets in document order;
    verify against the parser), and reads each one relative to the
    entities directory. Every file found is emitted as a
    ``--- ENTITY: <slug> ---`` section, where ``<slug>`` is the file
    stem, so downstream stages get the same delimited format the LLM
    produces.

    Includes that point at a file which no longer exists are skipped,
    with a warning so the gap is visible instead of silently shrinking
    the chapter's entity set.

    Returns:
        ``(combined, entity_files)``: the delimited text (empty string
        when no entity could be read) and a ``(slug, path)`` pair for
        each entity that was read.

    Raises:
        FileNotFoundError: if the chapter view file does not exist.
    """
    from markitect.packaging.transclusion.directives import DirectiveParser

    entities_dir = self._entities_dir()
    view_path = entities_dir / f"{chapter_id}-entities.md"
    includes = DirectiveParser.extract_file_includes(view_path.read_text())

    entity_files: list[tuple[str, Path]] = []
    sections: list[str] = []
    for rel_path in includes:
        file_path = entities_dir / rel_path
        if not file_path.exists():
            print(f" WARNING: {view_path.name} includes missing entity file: {rel_path}")
            continue
        slug = file_path.stem
        body = file_path.read_text().strip()
        sections.append(f"--- ENTITY: {slug} ---\n\n{body}")
        entity_files.append((slug, file_path))

    combined = "\n\n".join(sections) + "\n" if sections else ""
    return combined, entity_files
# ── Pipeline Stages ──────────────────────────────────────────────
def stage_extract_entities(self, chapter_id: str, chapter_content: str) -> Optional[str]:
"""Stage 1: Extract economic entities from a chapter."""
"""Stage 1: Extract economic entities from a chapter.
Canonical entity files live in a **flat** directory
(``output/entities/<slug>.md``). Duplicates across chapters are
skipped — first occurrence wins. The per-chapter view file
(``<chapter_id>-entities.md``) is a **secondary** transclusion view
that ``{{ include }}``s each entity relevant to the chapter.
"""
print(f" [1/3] Extracting entities...")
# Bind the chapter content to the macro name
self.bind_macro_artifact(self.spaces["sources"], "chapter_text", chapter_content)
macros = [
self._macro("chapter_text"),
self._macro("extraction_rules"),
self._macro("vsm_framework"),
]
# Bind existing entity list so the LLM knows what already exists
existing = self._list_existing_entity_names()
if existing:
entity_list = "\n".join(f"- {name}" for name in existing)
else:
entity_list = "(none — this is the first chapter)"
self.bind_macro_artifact(
self.spaces["entities"], "existing_entities", entity_list
)
prompt = self.resolve_and_compile(
"extract-entities", macros, ["sources", "guidelines", "vsm-reference"]
"extract-entities",
["sources", "guidelines", "vsm-reference", "entities"],
)
if not prompt:
return None
# Write compiled prompt for inspection / LLM execution
prompt_file = self.example_dir / "output" / "entities" / f"{chapter_id}-prompt.md"
# Write compiled prompt for inspection
prompt_file = self._entities_dir() / f"{chapter_id}-prompt.md"
prompt_file.parent.mkdir(parents=True, exist_ok=True)
prompt_file.write_text(prompt)
print(f" Prompt written to {prompt_file.relative_to(self.example_dir)}")
# Check for existing output (manual or LLM-generated)
output_file = self.example_dir / "output" / "entities" / f"{chapter_id}-entities.md"
if output_file.exists():
content = output_file.read_text()
view_file = self._entities_dir() / f"{chapter_id}-entities.md"
# ── PRIMARY: chapter view with transclusion already on disk ──
if view_file.exists() and "{{ include" in view_file.read_text():
content, entity_files = self._read_entities_from_view(chapter_id)
self.store_output_artifact(
self.spaces["entities"],
f"{chapter_id}-entities",
content,
ArtifactType.GENERATED,
)
print(f" Found existing output: {output_file.name}")
print(f" Found chapter view referencing {len(entity_files)} entities")
return content
# Auto-generate via LLM if adapter is available
if self.llm_adapter and prompt:
content = self._execute_llm(prompt, output_file, "entities")
if content:
# ── MIGRATION: per-chapter subdirectory (previous format) ──
subdir = self._entities_dir() / chapter_id
if subdir.is_dir() and list(subdir.glob("*.md")):
print(f" Migrating per-chapter subdir: {chapter_id}/")
entity_files: list[tuple[str, Path]] = []
entities_dir = self._entities_dir()
for src in sorted(subdir.glob("*.md")):
dest = entities_dir / src.name
if not dest.exists():
src.rename(dest)
entity_files.append((src.stem, dest))
# Clean up empty subdir
if not list(subdir.glob("*")):
subdir.rmdir()
self._write_chapter_entity_view(chapter_id, entity_files)
content = self._read_entities_from_view(chapter_id)[0]
self.store_output_artifact(
self.spaces["entities"],
f"{chapter_id}-entities",
content,
ArtifactType.GENERATED,
)
return content
# ── MIGRATION: legacy combined file (pre-split format) ──
if view_file.exists():
raw = view_file.read_text()
if "--- ENTITY:" in raw:
print(f" Migrating legacy combined file: {view_file.name}")
entity_files = self._split_entities(raw)
self._write_chapter_entity_view(chapter_id, entity_files)
self.store_output_artifact(
self.spaces["entities"],
f"{chapter_id}-entities",
content,
raw,
ArtifactType.GENERATED,
)
return content
return raw
print(f" Awaiting output at: {output_file.relative_to(self.example_dir)}")
# ── GENERATE: call LLM, persist individual files first ──
if self.llm_adapter and prompt:
combined = self._call_llm(prompt, "entities")
if combined:
entity_files = self._split_entities(combined)
self._write_chapter_entity_view(chapter_id, entity_files)
self.store_output_artifact(
self.spaces["entities"],
f"{chapter_id}-entities",
combined,
ArtifactType.GENERATED,
)
return combined
print(f" Awaiting entity files in: output/entities/")
return None
def stage_map_to_vsm(self, chapter_id: str, entities_content: str) -> Optional[str]:
@@ -344,14 +514,8 @@ class ChapterProcessor:
self.bind_macro_artifact(self.spaces["entities"], "entities", entities_content)
macros = [
self._macro("entities"),
self._macro("vsm_framework"),
self._macro("mapping_rules"),
]
prompt = self.resolve_and_compile(
"map-to-vsm", macros, ["entities", "vsm-reference", "guidelines"]
"map-to-vsm", ["entities", "vsm-reference", "guidelines"]
)
if not prompt:
return None
@@ -396,16 +560,8 @@ class ChapterProcessor:
self.bind_macro_artifact(self.spaces["entities"], "entities", entities_content)
self.bind_macro_artifact(self.spaces["mappings"], "mappings", mappings_content)
macros = [
self._macro("chapter_text"),
self._macro("entities"),
self._macro("mappings"),
self._macro("vsm_framework"),
]
prompt = self.resolve_and_compile(
"synthesize-analysis",
macros,
["sources", "entities", "mappings", "vsm-reference"],
)
if not prompt:
@@ -462,13 +618,8 @@ class ChapterProcessor:
self.bind_macro_artifact(self.spaces["analyses"], "all_analyses", combined)
macros = [
self._macro("all_analyses"),
self._macro("vsm_framework"),
]
prompt = self.resolve_and_compile(
"assess-metrics", macros, ["analyses", "vsm-reference"]
"assess-metrics", ["analyses", "vsm-reference"]
)
if not prompt:
return None
@@ -615,11 +766,20 @@ class ChapterProcessor:
print(f" {'-'*30} {'-'*12} {'-'*12} {'-'*12}")
for ch in chapters:
entities = "done" if (self.example_dir / "output" / "entities" / f"{ch}-entities.md").exists() else "-"
view_file = self._entities_dir() / f"{ch}-entities.md"
entity_count = 0
if view_file.exists() and "{{ include" in view_file.read_text():
from markitect.packaging.transclusion.directives import DirectiveParser
entity_count = len(DirectiveParser.extract_file_includes(view_file.read_text()))
entities = f"done ({entity_count})" if entity_count else "-"
mappings = "done" if (self.example_dir / "output" / "mappings" / f"{ch}-mappings.md").exists() else "-"
analysis = "done" if (self.example_dir / "output" / "analyses" / f"{ch}-analysis.md").exists() else "-"
print(f" {ch:<30} {entities:<12} {mappings:<12} {analysis:<12}")
total_entities = len(self._list_existing_entity_names())
if total_entities:
print(f"\n Canonical entity set: {total_entities} unique entities")
# ── Statistics ───────────────────────────────────────────────────
def show_stats(self):