generated from coulomb/repo-seed
Enrich reports/generation-summary.md with the review-oriented sections that the 2026-05-17 smoke run flagged as missing: ## Chapter coverage (per-chapter source/entity/relation/anchor counts), ## Entities (the deduped title list), ## Unmapped source chunks (sources with no downstream generated artifact), and ## Page anchors (total plus deterministic sample). Sections are conditional on data being present so generic non-Lefevre runs stay terse. Add docs/lefevre-readiness.md as the final sign-off document for IB-WP-0016: what is wired (T01-T06 recap), an output policy table (checked-in fixture sources vs disposable generated infospaces vs archive targets), a seven-item reviewer checklist (duplicate entities, relation endpoints, weak evidence, overgeneralization, anchor coverage, unmapped sources, plan-vs-actual variance), a scale-up plan from one-chapter to full-book, and the load-bearing risks still outstanding (cross-chunk dedup, whole-run resume, adaptive routing deferred to LLM-WP-0004 / IB-WP-0018, rate-table drift). Closes IB-WP-0016 (Lefevre EPUB3 Infospace Readiness Pilot): T01-T07 all done; the workplan is set to status=done. 131 tests pass, 1 skipped (live OpenRouter smoke, correctly gated). Co-Authored-By: Claude Opus 4.7 <noreply@anthropic.com>
213 lines
7.8 KiB
Python
213 lines
7.8 KiB
Python
import json
|
|
import os
|
|
import subprocess
|
|
import sys
|
|
import zipfile
|
|
from pathlib import Path
|
|
|
|
from infospace_bench.budget import (
|
|
read_plan_snapshots,
|
|
read_usage_runs,
|
|
)
|
|
from infospace_bench.generator import (
|
|
init_generation_infospace,
|
|
plan_generation,
|
|
run_generation,
|
|
status_generation,
|
|
)
|
|
from infospace_bench.source_intake import (
|
|
SECTION_ROLE_BODY,
|
|
SECTION_ROLE_COVER,
|
|
SECTION_ROLE_FOOTER,
|
|
SECTION_ROLE_HEADER,
|
|
SECTION_ROLE_LICENSE,
|
|
SECTION_ROLE_NAV,
|
|
SECTION_ROLE_NOTES,
|
|
normalize_source,
|
|
)
|
|
|
|
|
|
# Checked-in Lefevre fixture tree, located next to this test module.
FIXTURE_ROOT = Path(__file__).parent.joinpath("fixtures", "lefevre")
# Raw EPUB parts (container.xml, content.opf, *.xhtml) read by _build_fixture_epub.
FIXTURE_SOURCES = FIXTURE_ROOT.joinpath("sources")
# Fixture responses file handed to run_generation / the CLI via fixture_responses.
FIXTURE_RESPONSES = FIXTURE_ROOT.joinpath("responses.yaml")
|
|
|
|
|
|
def _build_fixture_epub(target: Path) -> Path:
    """Zip the checked-in Lefevre fixture sources into one EPUB archive.

    Every fixture file is read before the archive is opened, so a missing
    source raises before ``target`` is created.

    Args:
        target: Path of the EPUB file to write.

    Returns:
        ``target``, unchanged, for call-site convenience.
    """
    # Insertion order is the archive order: mimetype first, then the
    # container, the XHTML pages (sorted by path) and finally the OPF package.
    members = {
        "mimetype": "application/epub+zip",
        "META-INF/container.xml": (FIXTURE_SOURCES / "container.xml").read_text(encoding="utf-8"),
    }
    members.update(
        (f"OEBPS/{page.name}", page.read_text(encoding="utf-8"))
        for page in sorted(FIXTURE_SOURCES.glob("*.xhtml"))
    )
    members["OEBPS/content.opf"] = (FIXTURE_SOURCES / "content.opf").read_text(encoding="utf-8")

    with zipfile.ZipFile(target, "w") as epub:
        for member_name, text in members.items():
            epub.writestr(member_name, text)
    return target
|
|
|
|
|
|
def test_lefevre_fixture_builds_a_complete_infospace(tmp_path: Path) -> None:
    """Run init, plan and run on the fixture EPUB and check the expected outputs exist."""
    epub_path = _build_fixture_epub(tmp_path / "lefevre.epub")
    infospace = init_generation_infospace(
        tmp_path,
        epub_path,
        "lefevre-fixture",
        name="Reminiscences of a Stock Operator (Fixture)",
        profile="trading-literature",
    )
    root = infospace.root

    plan_generation(root)
    result = run_generation(root, fixture_responses=FIXTURE_RESPONSES)
    status = status_generation(root)

    assert result.status == "completed"
    assert status["profile"] == "trading-literature"
    # Three body chapters in the fixture spine; cover/nav/header/notes/license/footer are excluded by default.
    assert status["source_chunk_count"] == 3
    # Every downstream stage must have produced at least one item.
    for counter in ("entity_count", "relation_count", "evaluation_count", "history_snapshot_count"):
        assert status[counter] >= 1

    artifacts_dir = root / "artifacts"
    output_dir = root / "output"

    # Stable chapter-NN source filenames from the IB-WP-0016 T02 work.
    source_names = {md_file.name for md_file in (artifacts_dir / "sources").glob("*.md")}
    assert {"chapter-01.md", "chapter-02.md", "chapter-03.md"} == source_names

    # Manifest-backed artifacts: entities, relations, evaluations, metrics, history, report
    for subdir in ("entities", "relations"):
        assert (artifacts_dir / subdir).is_dir()
    assert any((output_dir / "evaluations").glob("*.md"))
    for metrics_file in ("metrics.yaml", "history.yaml"):
        assert (output_dir / "metrics" / metrics_file).is_file()
    assert (root / "reports" / "generation-summary.md").is_file()

    # Budget registry artifacts (IB-WP-0019) should land alongside the run.
    assert read_plan_snapshots(root), "plan snapshot must persist"
    runs = read_usage_runs(root)
    assert runs
    # The first usage run must point at the most recent plan snapshot.
    first_run_snapshot = runs[0]["snapshot_id"]
    latest_plan_snapshot = read_plan_snapshots(root)[-1]["snapshot_id"]
    assert first_run_snapshot == latest_plan_snapshot

    # Book provenance plumb-through: every source artifact knows the chapter it came from.
    import yaml as _yaml

    index_text = (artifacts_dir / "index.yaml").read_text(encoding="utf-8")
    index = _yaml.safe_load(index_text)
    chapter_numbers = [
        entry["provenance"]["chapter_number"]
        for entry in index["artifacts"]
        if entry["kind"] == "source"
    ]
    chapter_numbers.sort()
    assert chapter_numbers == [1, 2, 3]
|
|
|
|
|
|
def test_lefevre_fixture_excludes_gutenberg_boilerplate_by_default(tmp_path: Path) -> None:
    """Check that normalize_source keeps only body chunks unless include_non_body is set."""
    epub_path = _build_fixture_epub(tmp_path / "lefevre.epub")

    body_only = normalize_source(epub_path)
    everything = normalize_source(epub_path, include_non_body=True)

    # Default: only the three body chapters survive.
    labels = [chunk.chapter_label for chunk in body_only]
    assert labels == ["I", "II", "III"]
    assert {chunk.section_role for chunk in body_only} == {SECTION_ROLE_BODY}

    # include_non_body: cover, nav, PG header, notes, license, footer all appear.
    seen_roles = {chunk.section_role for chunk in everything}
    for expected_role in (
        SECTION_ROLE_COVER,
        SECTION_ROLE_NAV,
        SECTION_ROLE_HEADER,
        SECTION_ROLE_NOTES,
        SECTION_ROLE_LICENSE,
        SECTION_ROLE_FOOTER,
    ):
        assert expected_role in seen_roles
|
|
|
|
|
|
def test_generation_report_includes_review_sections(tmp_path: Path) -> None:
    """Check that a full fixture run writes the review sections into generation-summary.md."""
    epub_path = _build_fixture_epub(tmp_path / "lefevre.epub")
    infospace = init_generation_infospace(
        tmp_path,
        epub_path,
        "lefevre-review",
        name="Lefevre Review",
        profile="trading-literature",
    )
    plan_generation(infospace.root)
    run_generation(infospace.root, fixture_responses=FIXTURE_RESPONSES)

    report_path = infospace.root / "reports" / "generation-summary.md"
    report = report_path.read_text(encoding="utf-8")

    required_fragments = (
        "## Chapter coverage",
        "Chapter 01 (I)",
        "Chapter 02 (II)",
        "Chapter 03 (III)",
        "## Entities",
        # The trading-literature fixture emits Larry Livingston, Bucket Shop, Tape Reading
        "Larry Livingston",
        "Bucket Shop",
        "Tape Reading",
        "## Page anchors",
        "Page_1",
    )
    for fragment in required_fragments:
        assert fragment in report

    # All three chapters have generated artifacts → no unmapped section
    assert "## Unmapped source chunks" not in report
|
|
|
|
|
|
def test_generation_report_flags_unmapped_sources(tmp_path: Path) -> None:
    """Check that review data still has chapter coverage rows after a summary-only run.

    NOTE(review): despite the test name, nothing here asserts on unmapped
    sources; only ``chapter_coverage`` is inspected. Confirm whether an
    unmapped-sources assertion was intended.
    """
    epub_path = _build_fixture_epub(tmp_path / "lefevre.epub")
    infospace = init_generation_infospace(
        tmp_path,
        epub_path,
        "lefevre-partial",
        name="Lefevre Partial",
        profile="trading-literature",
    )
    root = infospace.root
    plan_generation(root)

    # Only the "summary" stage is requested; per the original author's note this
    # is meant to skip entity/relation generation (not verifiable from this file).
    run_generation(root, stage="summary", fixture_responses=FIXTURE_RESPONSES)

    # Call the private report helper directly rather than parsing the rendered report.
    from infospace_bench.generator import _collect_review_report

    review = _collect_review_report(root)
    coverage_rows = review["chapter_coverage"]
    assert coverage_rows, "chapter coverage rows must still be produced"
    assert coverage_rows[0]["source_count"] >= 1
|
|
|
|
|
|
def test_lefevre_fixture_cli_end_to_end(tmp_path: Path) -> None:
    """Drive ``python -m infospace_bench generate from-source`` on the fixture EPUB.

    The CLI runs in a subprocess with the fixture responses file and
    ``--apply``. The test expects exit code 0 and a JSON payload on stdout
    whose ``status`` is ``"completed"`` and whose ``root`` contains the slug.
    """
    book = _build_fixture_epub(tmp_path / "lefevre.epub")

    env = os.environ.copy()
    # Prepend the project's search entries but keep any inherited PYTHONPATH,
    # which may be how the environment supplies dependencies. os.pathsep keeps
    # the joined value valid on platforms that do not use ":".
    # NOTE(review): "src" is relative to the subprocess working directory, and
    # the markitect-tool path is machine-specific - confirm both are still wanted.
    search_path = ["src", "/home/worsch/markitect-tool/src"]
    inherited = env.get("PYTHONPATH")
    if inherited:
        search_path.append(inherited)
    env["PYTHONPATH"] = os.pathsep.join(search_path)

    command = [
        sys.executable,
        "-m",
        "infospace_bench",
        "generate",
        "from-source",
        str(book),
        "--workspace",
        str(tmp_path),
        "--slug",
        "lefevre-fixture-cli",
        "--name",
        "Lefevre Fixture (CLI)",
        "--profile",
        "trading-literature",
        "--fixture-responses",
        str(FIXTURE_RESPONSES),
        "--apply",
    ]
    # check=False so a failure surfaces through the assert below with stderr attached.
    result = subprocess.run(
        command,
        check=False,
        env=env,
        text=True,
        capture_output=True,
    )

    assert result.returncode == 0, result.stderr
    payload = json.loads(result.stdout)
    assert payload["status"] == "completed"
    assert "lefevre-fixture-cli" in payload["root"]
|