feat(infospace): flat canonical entity set with cross-chapter deduplication
Restructure entity storage from per-chapter subdirectories to a flat
canonical set in output/entities/. Each entity exists as a single file;
duplicates across chapters are detected by slug collision and skipped
(first occurrence wins). Chapter views use {{ include }} transclusion
to reference shared entity files.
Add @{existing_entities} macro to extract-entities template so the LLM
knows which entities already exist and focuses on genuinely new ones.
Extract _call_llm() from _execute_llm() for callers that handle their
own file I/O. Result: 41 unique entities from 4 chapters (2 duplicates removed).
Co-Authored-By: Claude Opus 4.6 <noreply@anthropic.com>
This commit is contained in:
@@ -32,6 +32,7 @@ Usage:
|
||||
"""
|
||||
|
||||
import argparse
|
||||
import re
|
||||
import subprocess
|
||||
import sys
|
||||
from pathlib import Path
|
||||
@@ -45,7 +46,8 @@ from markitect.prompts.models import Artifact, ArtifactType
|
||||
from markitect.prompts.repositories.sqlite import SQLiteArtifactRepository
|
||||
from markitect.prompts.dependencies.repository import SQLiteDependencyRepository
|
||||
from markitect.prompts.services.artifact_service import ArtifactService
|
||||
from markitect.prompts.templates.models import PromptTemplate, ContentMacro, MacroKind
|
||||
from markitect.prompts.templates.models import PromptTemplate
|
||||
from markitect.prompts.templates.analyzer import TemplateAnalyzer
|
||||
from markitect.prompts.resolver.resolver import PromptResolver
|
||||
from markitect.prompts.resolver.compiler import ContextCompiler
|
||||
from markitect.prompts.resolver.strategy import ResolutionConfig, MultiSpaceResolutionStrategy
|
||||
@@ -80,6 +82,10 @@ class ChapterProcessor:
|
||||
self.artifact_repo, self.dep_repo, db_path=self.db_path
|
||||
)
|
||||
|
||||
# Template analysis and compilation
|
||||
self.analyzer = TemplateAnalyzer()
|
||||
self.compiler = ContextCompiler()
|
||||
|
||||
# Information spaces
|
||||
self.spaces = {
|
||||
"templates": "infospace-templates",
|
||||
@@ -92,9 +98,6 @@ class ChapterProcessor:
|
||||
"metrics": "infospace-metrics",
|
||||
}
|
||||
|
||||
# Content cache (repository stores metadata, we cache content)
|
||||
self.artifact_content: dict[str, str] = {}
|
||||
|
||||
# ── Artifact Management ──────────────────────────────────────────
|
||||
|
||||
def load_or_create_artifact(
|
||||
@@ -103,7 +106,7 @@ class ChapterProcessor:
|
||||
filepath: Path,
|
||||
artifact_type: ArtifactType,
|
||||
name: Optional[str] = None,
|
||||
) -> tuple[Artifact, str]:
|
||||
) -> Artifact:
|
||||
"""Load artifact from file, create in repo if needed."""
|
||||
if name is None:
|
||||
name = filepath.stem
|
||||
@@ -112,16 +115,14 @@ class ChapterProcessor:
|
||||
|
||||
existing = self.artifact_repo.get_by_name(space, name)
|
||||
if existing:
|
||||
self.artifact_content[existing.id] = content
|
||||
return existing, content
|
||||
return existing
|
||||
|
||||
artifact = Artifact.create(
|
||||
space_id=space, name=name, content=content, artifact_type=artifact_type
|
||||
)
|
||||
artifact = self.artifact_repo.create(artifact)
|
||||
self.artifact_content[artifact.id] = content
|
||||
print(f" + {name} ({artifact.content_digest[:8]})")
|
||||
return artifact, content
|
||||
return artifact
|
||||
|
||||
def store_output_artifact(
|
||||
self, space: str, name: str, content: str, artifact_type: ArtifactType
|
||||
@@ -135,7 +136,6 @@ class ChapterProcessor:
|
||||
space_id=space, name=name, content=content, artifact_type=artifact_type
|
||||
)
|
||||
artifact = self.artifact_repo.create(artifact)
|
||||
self.artifact_content[artifact.id] = content
|
||||
return artifact
|
||||
|
||||
def bind_macro_artifact(self, space: str, macro_name: str, content: str) -> Artifact:
|
||||
@@ -151,7 +151,6 @@ class ChapterProcessor:
|
||||
artifact_type=ArtifactType.CONTENT,
|
||||
)
|
||||
artifact = self.artifact_repo.create(artifact)
|
||||
self.artifact_content[artifact.id] = content
|
||||
return artifact
|
||||
|
||||
# ── Setup ────────────────────────────────────────────────────────
|
||||
@@ -186,23 +185,16 @@ class ChapterProcessor:
|
||||
|
||||
print(" Done.\n")
|
||||
|
||||
# ── Helpers ───────────────────────────────────────────────────────
|
||||
|
||||
@staticmethod
def _macro(target: str, kind: MacroKind = MacroKind.REQUIRED) -> ContentMacro:
    """Build a ``ContentMacro`` for *target* spelled in ``@{target}`` form.

    The macro's ``raw_text`` is the literal ``@{<target>}`` token; *kind*
    is passed through unchanged and defaults to ``MacroKind.REQUIRED``.
    """
    raw = f"@{{{target}}}"
    return ContentMacro(kind=kind, target=target, raw_text=raw)
|
||||
|
||||
# ── Template Resolution ──────────────────────────────────────────
|
||||
|
||||
def resolve_and_compile(
|
||||
self, template_name: str, macros: list[ContentMacro], extra_spaces: list[str]
|
||||
self, template_name: str, extra_spaces: list[str]
|
||||
) -> Optional[str]:
|
||||
"""Resolve macros and compile a template into a final prompt string.
|
||||
|
||||
Uses the resolver for dependency validation, then performs content
|
||||
substitution from our local cache (since the artifact repository
|
||||
doesn't persist content — see resolver.py line 147).
|
||||
Uses TemplateAnalyzer to parse @{target} macros from the template,
|
||||
the resolver to look up artifact content, and ContextCompiler to
|
||||
assemble the final prompt.
|
||||
"""
|
||||
template_artifact = self.artifact_repo.get_by_name(
|
||||
self.spaces["templates"], template_name
|
||||
@@ -212,8 +204,10 @@ class ChapterProcessor:
|
||||
return None
|
||||
|
||||
template = PromptTemplate.from_artifact(template_artifact)
|
||||
template.macros = macros
|
||||
template.analyzed = True
|
||||
template_content = template_artifact.content
|
||||
|
||||
# Analyze template to extract @{target} macros
|
||||
self.analyzer.analyze(template, template_content)
|
||||
|
||||
config = ResolutionConfig(
|
||||
space_id=self.spaces["templates"],
|
||||
@@ -228,31 +222,16 @@ class ChapterProcessor:
|
||||
print(f" ERROR: Resolution failed: {result.context.errors}")
|
||||
return None
|
||||
|
||||
# Load template content
|
||||
template_content = self.artifact_content.get(template_artifact.id)
|
||||
if not template_content:
|
||||
template_content = (
|
||||
self.example_dir / "templates" / f"{template_name}.md"
|
||||
).read_text()
|
||||
# Compile template with resolved content
|
||||
compiled = self.compiler.compile(template, template_content, result)
|
||||
return compiled.content
|
||||
|
||||
# Substitute macros with actual content from cache
|
||||
# (The resolver returns placeholders because the repo doesn't store content)
|
||||
compiled_content = template_content
|
||||
for resolved in result.context.resolved_macros:
|
||||
if resolved.resolved and resolved.artifact:
|
||||
actual_content = self.artifact_content.get(resolved.artifact.id, "")
|
||||
compiled_content = compiled_content.replace(
|
||||
f"@{{{resolved.macro.target}}}", actual_content
|
||||
)
|
||||
# ── LLM Execution Helpers ─────────────────────────────────────────
|
||||
|
||||
return compiled_content
|
||||
def _call_llm(self, prompt: str, stage_label: str) -> Optional[str]:
|
||||
"""Call the LLM and return the content string, or ``None`` on failure.
|
||||
|
||||
# ── LLM Execution Helper ────────────────────────────────────────
|
||||
|
||||
def _execute_llm(self, prompt: str, output_file: Path, stage_label: str) -> Optional[str]:
|
||||
"""Execute *prompt* via the configured LLM adapter and write the result.
|
||||
|
||||
Returns the generated content, or ``None`` on failure.
|
||||
Does **not** write any files — callers decide where to persist.
|
||||
"""
|
||||
import time as _time
|
||||
from markitect.prompts.execution.models import RunConfig
|
||||
@@ -279,63 +258,254 @@ class ChapterProcessor:
|
||||
print(f" LLM returned empty content")
|
||||
return None
|
||||
|
||||
output_file.parent.mkdir(parents=True, exist_ok=True)
|
||||
output_file.write_text(content)
|
||||
print(f" LLM output written to {output_file.name}")
|
||||
return content
|
||||
|
||||
def _execute_llm(self, prompt: str, output_file: Path, stage_label: str) -> Optional[str]:
    """Run *prompt* through ``_call_llm`` and persist a non-empty result.

    When the call yields content, the parent directory of *output_file* is
    created if needed and the content is written to *output_file*.  The
    value returned by ``_call_llm`` is handed back unchanged, so a falsy
    result means nothing was written.
    """
    result = self._call_llm(prompt, stage_label)
    if not result:
        # Nothing to persist; propagate the falsy value as-is.
        return result

    target_dir = output_file.parent
    target_dir.mkdir(parents=True, exist_ok=True)
    output_file.write_text(result)
    print(f" LLM output written to {output_file.name}")
    return result
|
||||
|
||||
# ── Entity Management (flat canonical set) ─────────────────────
|
||||
|
||||
@staticmethod
def _normalize_entity_name(name: str) -> str:
    """Turn an entity name into a kebab-case filename stem.

    The name is lower-cased and trimmed, underscores and spaces become
    hyphens, every character outside ``a-z``, ``0-9`` and ``-`` is
    dropped, runs of hyphens collapse to one, and leading/trailing
    hyphens are removed.  The result may be empty.
    """
    separators_to_hyphen = str.maketrans({"_": "-", " ": "-"})
    hyphenated = name.lower().strip().translate(separators_to_hyphen)
    allowed_only = re.sub(r"[^a-z0-9-]", "", hyphenated)
    collapsed = re.sub(r"-{2,}", "-", allowed_only)
    return collapsed.strip("-")
|
||||
|
||||
def _entities_dir(self) -> Path:
    """Return ``<example_dir>/output/entities`` (the path is not created here)."""
    return self.example_dir.joinpath("output", "entities")
|
||||
|
||||
def _list_existing_entity_names(self) -> list[str]:
    """List the stems of entity files in the entities directory, sorted.

    Every ``*.md`` file directly inside ``self._entities_dir()`` counts,
    except files whose names end in ``-entities.md`` or ``-prompt.md``.
    """
    excluded_suffixes = ("-entities.md", "-prompt.md")
    stems = [
        candidate.stem
        for candidate in self._entities_dir().glob("*.md")
        if not candidate.name.endswith(excluded_suffixes)
    ]
    stems.sort()
    return stems
|
||||
|
||||
def _split_entities(
    self, combined_content: str
) -> list[tuple[str, Path]]:
    """Split combined LLM output into the flat canonical entity directory.

    *combined_content* is cut at ``--- ENTITY: <name> ---`` header lines.
    Each entity is written to ``<entities_dir>/<slug>.md`` where the slug
    comes from ``_normalize_entity_name``.  If a file with that slug
    already exists it is **not** overwritten (first occurrence wins), but
    the entity is still returned so the chapter view can reference it.

    A slug that appears more than once inside *combined_content* is
    handled only the first time; later repeats are ignored so the
    returned list never names the same file twice.  Headers whose name
    normalizes to an empty slug are dropped.

    Returns:
        ``(entity_name, file_path)`` for each distinct entity found, in
        order of first appearance (new and pre-existing alike).
    """
    entities_dir = self._entities_dir()
    entities_dir.mkdir(parents=True, exist_ok=True)

    # With one capture group, re.split yields
    # [preamble, name1, body1, name2, body2, ...].
    parts = re.split(
        r"^---\s*ENTITY:\s*(.+?)\s*---\s*$",
        combined_content,
        flags=re.MULTILINE,
    )

    entity_files: list[tuple[str, Path]] = []
    seen_slugs: set[str] = set()
    new_count = 0
    skipped_count = 0
    repeated_count = 0

    for entity_name, body in zip(parts[1::2], parts[2::2]):
        slug = self._normalize_entity_name(entity_name)
        if not slug:
            continue

        # A repeat within the same output would otherwise be listed twice
        # and miscounted as "pre-existing".
        if slug in seen_slugs:
            repeated_count += 1
            continue
        seen_slugs.add(slug)

        file_path = entities_dir / f"{slug}.md"
        if file_path.exists():
            skipped_count += 1
        else:
            file_path.write_text(body.strip() + "\n")
            new_count += 1

        entity_files.append((entity_name, file_path))

    msg = f" {new_count} new entities written"
    if skipped_count:
        msg += f", {skipped_count} pre-existing (skipped)"
    if repeated_count:
        msg += f", {repeated_count} repeated in this output (ignored)"
    print(msg)
    return entity_files
|
||||
|
||||
def _write_chapter_entity_view(
    self, chapter_id: str, entity_files: list[tuple[str, Path]]
) -> Path:
    """Write a per-chapter view file that transcludes individual entities.

    The view is ``<entities_dir>/<chapter_id>-entities.md``.  It starts
    with a title line and then holds one ``{{ include "<file>" }}``
    directive per entry of *entity_files* (only the file name is used),
    with ``---`` separators between consecutive entries.

    The book and chapter numbers in the title are taken from the 2nd and
    4th hyphen-separated tokens of *chapter_id*.  A token that is missing
    or not an integer falls back to book 1 / chapter 0 instead of raising.

    Returns:
        The path of the view file that was written.
    """
    tokens = chapter_id.split("-")

    def _int_token(index: int, default: int) -> int:
        # Tolerate ids that do not follow the numeric pattern.
        try:
            return int(tokens[index])
        except (IndexError, ValueError):
            return default

    book_num = _int_token(1, 1)
    ch_num = _int_token(3, 0)
    roman = {1: "I", 2: "II", 3: "III", 4: "IV", 5: "V"}.get(book_num, str(book_num))
    title = f"# Economic Entities — Book {roman}, Chapter {ch_num}\n"

    lines = [title]
    for position, (_name, file_path) in enumerate(entity_files):
        if position:
            # Separator goes between entities, never after the last one.
            lines.extend(["---", ""])
        lines.append(f'{{{{ include "{file_path.name}" }}}}')
        lines.append("")

    view_path = self._entities_dir() / f"{chapter_id}-entities.md"
    view_path.parent.mkdir(parents=True, exist_ok=True)
    view_path.write_text("\n".join(lines) + "\n")
    print(f" Chapter view written to {view_path.name}")
    return view_path
|
||||
|
||||
def _read_entities_from_view(
    self, chapter_id: str
) -> tuple[str, list[tuple[str, Path]]]:
    """Rebuild the combined entity text for a chapter from its view file.

    Reads ``<entities_dir>/<chapter_id>-entities.md``, asks
    ``DirectiveParser.extract_file_includes`` for the included paths
    (presumably one per ``{{ include "..." }}`` directive; verify against
    the parser), and resolves each against the entities directory.
    Included files that do not exist are skipped.

    Returns:
        ``(combined, entity_files)``: *combined* joins one
        ``--- ENTITY: <stem> ---`` section per file that was read (empty
        string when there are none), and *entity_files* lists the
        matching ``(stem, path)`` pairs in the same order.
    """
    from markitect.packaging.transclusion.directives import DirectiveParser

    entities_dir = self._entities_dir()
    view_text = (entities_dir / f"{chapter_id}-entities.md").read_text()

    entity_files: list[tuple[str, Path]] = []
    sections: list[str] = []
    for rel_path in DirectiveParser.extract_file_includes(view_text):
        file_path = entities_dir / rel_path
        if file_path.exists():
            slug = file_path.stem
            body = file_path.read_text().strip()
            sections.append(f"--- ENTITY: {slug} ---\n\n{body}")
            entity_files.append((slug, file_path))

    if not sections:
        return "", entity_files
    return "\n\n".join(sections) + "\n", entity_files
|
||||
|
||||
# ── Pipeline Stages ──────────────────────────────────────────────
|
||||
|
||||
def stage_extract_entities(self, chapter_id: str, chapter_content: str) -> Optional[str]:
|
||||
"""Stage 1: Extract economic entities from a chapter."""
|
||||
"""Stage 1: Extract economic entities from a chapter.
|
||||
|
||||
Canonical entity files live in a **flat** directory
|
||||
(``output/entities/<slug>.md``). Duplicates across chapters are
|
||||
skipped — first occurrence wins. The per-chapter view file
|
||||
(``<chapter_id>-entities.md``) is a **secondary** transclusion view
|
||||
that ``{{ include }}``s each entity relevant to the chapter.
|
||||
"""
|
||||
print(f" [1/3] Extracting entities...")
|
||||
|
||||
# Bind the chapter content to the macro name
|
||||
self.bind_macro_artifact(self.spaces["sources"], "chapter_text", chapter_content)
|
||||
|
||||
macros = [
|
||||
self._macro("chapter_text"),
|
||||
self._macro("extraction_rules"),
|
||||
self._macro("vsm_framework"),
|
||||
]
|
||||
# Bind existing entity list so the LLM knows what already exists
|
||||
existing = self._list_existing_entity_names()
|
||||
if existing:
|
||||
entity_list = "\n".join(f"- {name}" for name in existing)
|
||||
else:
|
||||
entity_list = "(none — this is the first chapter)"
|
||||
self.bind_macro_artifact(
|
||||
self.spaces["entities"], "existing_entities", entity_list
|
||||
)
|
||||
|
||||
prompt = self.resolve_and_compile(
|
||||
"extract-entities", macros, ["sources", "guidelines", "vsm-reference"]
|
||||
"extract-entities",
|
||||
["sources", "guidelines", "vsm-reference", "entities"],
|
||||
)
|
||||
if not prompt:
|
||||
return None
|
||||
|
||||
# Write compiled prompt for inspection / LLM execution
|
||||
prompt_file = self.example_dir / "output" / "entities" / f"{chapter_id}-prompt.md"
|
||||
# Write compiled prompt for inspection
|
||||
prompt_file = self._entities_dir() / f"{chapter_id}-prompt.md"
|
||||
prompt_file.parent.mkdir(parents=True, exist_ok=True)
|
||||
prompt_file.write_text(prompt)
|
||||
print(f" Prompt written to {prompt_file.relative_to(self.example_dir)}")
|
||||
|
||||
# Check for existing output (manual or LLM-generated)
|
||||
output_file = self.example_dir / "output" / "entities" / f"{chapter_id}-entities.md"
|
||||
if output_file.exists():
|
||||
content = output_file.read_text()
|
||||
view_file = self._entities_dir() / f"{chapter_id}-entities.md"
|
||||
|
||||
# ── PRIMARY: chapter view with transclusion already on disk ──
|
||||
if view_file.exists() and "{{ include" in view_file.read_text():
|
||||
content, entity_files = self._read_entities_from_view(chapter_id)
|
||||
self.store_output_artifact(
|
||||
self.spaces["entities"],
|
||||
f"{chapter_id}-entities",
|
||||
content,
|
||||
ArtifactType.GENERATED,
|
||||
)
|
||||
print(f" Found existing output: {output_file.name}")
|
||||
print(f" Found chapter view referencing {len(entity_files)} entities")
|
||||
return content
|
||||
|
||||
# Auto-generate via LLM if adapter is available
|
||||
if self.llm_adapter and prompt:
|
||||
content = self._execute_llm(prompt, output_file, "entities")
|
||||
if content:
|
||||
# ── MIGRATION: per-chapter subdirectory (previous format) ──
|
||||
subdir = self._entities_dir() / chapter_id
|
||||
if subdir.is_dir() and list(subdir.glob("*.md")):
|
||||
print(f" Migrating per-chapter subdir: {chapter_id}/")
|
||||
entity_files: list[tuple[str, Path]] = []
|
||||
entities_dir = self._entities_dir()
|
||||
for src in sorted(subdir.glob("*.md")):
|
||||
dest = entities_dir / src.name
|
||||
if not dest.exists():
|
||||
src.rename(dest)
|
||||
entity_files.append((src.stem, dest))
|
||||
# Clean up empty subdir
|
||||
if not list(subdir.glob("*")):
|
||||
subdir.rmdir()
|
||||
self._write_chapter_entity_view(chapter_id, entity_files)
|
||||
content = self._read_entities_from_view(chapter_id)[0]
|
||||
self.store_output_artifact(
|
||||
self.spaces["entities"],
|
||||
f"{chapter_id}-entities",
|
||||
content,
|
||||
ArtifactType.GENERATED,
|
||||
)
|
||||
return content
|
||||
|
||||
# ── MIGRATION: legacy combined file (pre-split format) ──
|
||||
if view_file.exists():
|
||||
raw = view_file.read_text()
|
||||
if "--- ENTITY:" in raw:
|
||||
print(f" Migrating legacy combined file: {view_file.name}")
|
||||
entity_files = self._split_entities(raw)
|
||||
self._write_chapter_entity_view(chapter_id, entity_files)
|
||||
self.store_output_artifact(
|
||||
self.spaces["entities"],
|
||||
f"{chapter_id}-entities",
|
||||
content,
|
||||
raw,
|
||||
ArtifactType.GENERATED,
|
||||
)
|
||||
return content
|
||||
return raw
|
||||
|
||||
print(f" Awaiting output at: {output_file.relative_to(self.example_dir)}")
|
||||
# ── GENERATE: call LLM, persist individual files first ──
|
||||
if self.llm_adapter and prompt:
|
||||
combined = self._call_llm(prompt, "entities")
|
||||
if combined:
|
||||
entity_files = self._split_entities(combined)
|
||||
self._write_chapter_entity_view(chapter_id, entity_files)
|
||||
self.store_output_artifact(
|
||||
self.spaces["entities"],
|
||||
f"{chapter_id}-entities",
|
||||
combined,
|
||||
ArtifactType.GENERATED,
|
||||
)
|
||||
return combined
|
||||
|
||||
print(f" Awaiting entity files in: output/entities/")
|
||||
return None
|
||||
|
||||
def stage_map_to_vsm(self, chapter_id: str, entities_content: str) -> Optional[str]:
|
||||
@@ -344,14 +514,8 @@ class ChapterProcessor:
|
||||
|
||||
self.bind_macro_artifact(self.spaces["entities"], "entities", entities_content)
|
||||
|
||||
macros = [
|
||||
self._macro("entities"),
|
||||
self._macro("vsm_framework"),
|
||||
self._macro("mapping_rules"),
|
||||
]
|
||||
|
||||
prompt = self.resolve_and_compile(
|
||||
"map-to-vsm", macros, ["entities", "vsm-reference", "guidelines"]
|
||||
"map-to-vsm", ["entities", "vsm-reference", "guidelines"]
|
||||
)
|
||||
if not prompt:
|
||||
return None
|
||||
@@ -396,16 +560,8 @@ class ChapterProcessor:
|
||||
self.bind_macro_artifact(self.spaces["entities"], "entities", entities_content)
|
||||
self.bind_macro_artifact(self.spaces["mappings"], "mappings", mappings_content)
|
||||
|
||||
macros = [
|
||||
self._macro("chapter_text"),
|
||||
self._macro("entities"),
|
||||
self._macro("mappings"),
|
||||
self._macro("vsm_framework"),
|
||||
]
|
||||
|
||||
prompt = self.resolve_and_compile(
|
||||
"synthesize-analysis",
|
||||
macros,
|
||||
["sources", "entities", "mappings", "vsm-reference"],
|
||||
)
|
||||
if not prompt:
|
||||
@@ -462,13 +618,8 @@ class ChapterProcessor:
|
||||
|
||||
self.bind_macro_artifact(self.spaces["analyses"], "all_analyses", combined)
|
||||
|
||||
macros = [
|
||||
self._macro("all_analyses"),
|
||||
self._macro("vsm_framework"),
|
||||
]
|
||||
|
||||
prompt = self.resolve_and_compile(
|
||||
"assess-metrics", macros, ["analyses", "vsm-reference"]
|
||||
"assess-metrics", ["analyses", "vsm-reference"]
|
||||
)
|
||||
if not prompt:
|
||||
return None
|
||||
@@ -615,11 +766,20 @@ class ChapterProcessor:
|
||||
print(f" {'-'*30} {'-'*12} {'-'*12} {'-'*12}")
|
||||
|
||||
for ch in chapters:
|
||||
entities = "done" if (self.example_dir / "output" / "entities" / f"{ch}-entities.md").exists() else "-"
|
||||
view_file = self._entities_dir() / f"{ch}-entities.md"
|
||||
entity_count = 0
|
||||
if view_file.exists() and "{{ include" in view_file.read_text():
|
||||
from markitect.packaging.transclusion.directives import DirectiveParser
|
||||
entity_count = len(DirectiveParser.extract_file_includes(view_file.read_text()))
|
||||
entities = f"done ({entity_count})" if entity_count else "-"
|
||||
mappings = "done" if (self.example_dir / "output" / "mappings" / f"{ch}-mappings.md").exists() else "-"
|
||||
analysis = "done" if (self.example_dir / "output" / "analyses" / f"{ch}-analysis.md").exists() else "-"
|
||||
print(f" {ch:<30} {entities:<12} {mappings:<12} {analysis:<12}")
|
||||
|
||||
total_entities = len(self._list_existing_entity_names())
|
||||
if total_entities:
|
||||
print(f"\n Canonical entity set: {total_entities} unique entities")
|
||||
|
||||
# ── Statistics ───────────────────────────────────────────────────
|
||||
|
||||
def show_stats(self):
|
||||
|
||||
Reference in New Issue
Block a user