import json
import os
import subprocess
import sys
import zipfile
from pathlib import Path

from infospace_bench.budget import (
    read_plan_snapshots,
    read_usage_runs,
)
from infospace_bench.generator import (
    init_generation_infospace,
    plan_generation,
    run_generation,
    status_generation,
)
from infospace_bench.source_intake import (
    SECTION_ROLE_BODY,
    SECTION_ROLE_COVER,
    SECTION_ROLE_FOOTER,
    SECTION_ROLE_HEADER,
    SECTION_ROLE_LICENSE,
    SECTION_ROLE_NAV,
    SECTION_ROLE_NOTES,
    normalize_source,
)

FIXTURE_ROOT = Path(__file__).parent / "fixtures" / "lefevre"
FIXTURE_SOURCES = FIXTURE_ROOT / "sources"
FIXTURE_RESPONSES = FIXTURE_ROOT / "responses.yaml"


def _build_fixture_epub(target: Path) -> Path:
    """Assemble the checked-in Lefevre fixture sources into a single EPUB zip.

    Args:
        target: Path of the archive to create.

    Returns:
        ``target``, so the call can be used inline.
    """
    # Dict insertion order is the order entries are written to the archive,
    # so "mimetype" is always the first member.
    layout = {
        "mimetype": "application/epub+zip",
        "META-INF/container.xml": (FIXTURE_SOURCES / "container.xml").read_text(encoding="utf-8"),
    }
    # Sort the glob result so the archive layout is deterministic across filesystems.
    for source in sorted(FIXTURE_SOURCES.glob("*.xhtml")):
        layout[f"OEBPS/{source.name}"] = source.read_text(encoding="utf-8")
    layout["OEBPS/content.opf"] = (FIXTURE_SOURCES / "content.opf").read_text(encoding="utf-8")
    with zipfile.ZipFile(target, "w") as archive:
        for path_in_zip, contents in layout.items():
            archive.writestr(path_in_zip, contents)
    return target


def test_lefevre_fixture_builds_a_complete_infospace(tmp_path: Path) -> None:
    """Init, plan and run generation on the fixture EPUB, then check every expected artifact."""
    book = _build_fixture_epub(tmp_path / "lefevre.epub")
    infospace = init_generation_infospace(
        tmp_path,
        book,
        "lefevre-fixture",
        name="Reminiscences of a Stock Operator (Fixture)",
        profile="trading-literature",
    )
    plan_generation(infospace.root)
    result = run_generation(infospace.root, fixture_responses=FIXTURE_RESPONSES)
    status = status_generation(infospace.root)

    assert result.status == "completed"
    assert status["profile"] == "trading-literature"
    # Three body chapters in the fixture spine; cover/nav/header/notes/license/footer are excluded by default.
    assert status["source_chunk_count"] == 3
    assert status["entity_count"] >= 1
    assert status["relation_count"] >= 1
    assert status["evaluation_count"] >= 1
    assert status["history_snapshot_count"] >= 1

    # Stable chapter-NN source filenames from the IB-WP-0016 T02 work.
    expected_sources = {"chapter-01.md", "chapter-02.md", "chapter-03.md"}
    actual_sources = {
        path.name for path in (infospace.root / "artifacts" / "sources").glob("*.md")
    }
    assert expected_sources == actual_sources

    # Manifest-backed artifacts: entities, relations, evaluations, metrics, history, report
    assert (infospace.root / "artifacts" / "entities").is_dir()
    assert (infospace.root / "artifacts" / "relations").is_dir()
    assert any((infospace.root / "output" / "evaluations").glob("*.md"))
    assert (infospace.root / "output" / "metrics" / "metrics.yaml").is_file()
    assert (infospace.root / "output" / "metrics" / "history.yaml").is_file()
    assert (infospace.root / "reports" / "generation-summary.md").is_file()

    # Budget registry artifacts (IB-WP-0019) should land alongside the run.
    snapshots = read_plan_snapshots(infospace.root)
    assert snapshots, "plan snapshot must persist"
    runs = read_usage_runs(infospace.root)
    # The first recorded run must reference the most recent plan snapshot.
    assert runs and runs[0]["snapshot_id"] == snapshots[-1]["snapshot_id"]

    # Book provenance plumb-through: every source artifact knows the chapter it came from.
    import yaml as _yaml

    index = _yaml.safe_load((infospace.root / "artifacts" / "index.yaml").read_text(encoding="utf-8"))
    chapter_numbers = sorted(
        item["provenance"]["chapter_number"]
        for item in index["artifacts"]
        if item["kind"] == "source"
    )
    assert chapter_numbers == [1, 2, 3]


def test_lefevre_fixture_excludes_gutenberg_boilerplate_by_default(tmp_path: Path) -> None:
    """Compare default normalization against ``include_non_body=True`` on the fixture EPUB."""
    book = _build_fixture_epub(tmp_path / "lefevre.epub")
    default_chunks = normalize_source(book)
    include_all_chunks = normalize_source(book, include_non_body=True)

    # Default: only the three body chapters survive.
    assert [chunk.chapter_label for chunk in default_chunks] == ["I", "II", "III"]
    assert {chunk.section_role for chunk in default_chunks} == {SECTION_ROLE_BODY}

    # include_non_body: cover, nav, PG header, notes, license, footer all appear.
    roles = {chunk.section_role for chunk in include_all_chunks}
    assert SECTION_ROLE_COVER in roles
    assert SECTION_ROLE_NAV in roles
    assert SECTION_ROLE_HEADER in roles
    assert SECTION_ROLE_NOTES in roles
    assert SECTION_ROLE_LICENSE in roles
    assert SECTION_ROLE_FOOTER in roles


def test_generation_report_includes_review_sections(tmp_path: Path) -> None:
    """Run a full generation and check the summary report's review headings and entries."""
    book = _build_fixture_epub(tmp_path / "lefevre.epub")
    infospace = init_generation_infospace(
        tmp_path,
        book,
        "lefevre-review",
        name="Lefevre Review",
        profile="trading-literature",
    )
    plan_generation(infospace.root)
    run_generation(infospace.root, fixture_responses=FIXTURE_RESPONSES)

    report = (infospace.root / "reports" / "generation-summary.md").read_text(encoding="utf-8")
    assert "## Chapter coverage" in report
    assert "Chapter 01 (I)" in report
    assert "Chapter 02 (II)" in report
    assert "Chapter 03 (III)" in report
    assert "## Entities" in report
    # The trading-literature fixture emits Larry Livingston, Bucket Shop, Tape Reading
    assert "Larry Livingston" in report
    assert "Bucket Shop" in report
    assert "Tape Reading" in report
    assert "## Page anchors" in report
    assert "Page_1" in report
    # All three chapters have generated artifacts → no unmapped section
    assert "## Unmapped source chunks" not in report


def test_generation_report_flags_unmapped_sources(tmp_path: Path) -> None:
    """When entity extraction is skipped for some sources, the report calls it out."""
    book = _build_fixture_epub(tmp_path / "lefevre.epub")
    infospace = init_generation_infospace(
        tmp_path,
        book,
        "lefevre-partial",
        name="Lefevre Partial",
        profile="trading-literature",
    )
    plan_generation(infospace.root)
    # Run only the summary workflow — entity/relation generation is skipped.
    run_generation(infospace.root, stage="summary", fixture_responses=FIXTURE_RESPONSES)
    # The "all" stage is deliberately NOT re-run here: doing so would give every
    # source its downstream artifacts again. Instead, verify the review helper
    # directly against this known-incomplete (summary-only) state.
    # NOTE(review): only chapter coverage is asserted below; nothing checks an
    # "unmapped" entry in the review payload — confirm whether that is intended.
    from infospace_bench.generator import _collect_review_report

    review = _collect_review_report(infospace.root)
    assert review["chapter_coverage"], "chapter coverage rows must still be produced"
    assert review["chapter_coverage"][0]["source_count"] >= 1


def test_lefevre_fixture_cli_end_to_end(tmp_path: Path) -> None:
    """Drive ``python -m infospace_bench generate from-source`` and check its JSON output."""
    book = _build_fixture_epub(tmp_path / "lefevre.epub")
    env = os.environ.copy()
    # Prepend the source trees instead of overwriting PYTHONPATH, so whatever the
    # invoking environment already relies on stays importable in the child
    # process, and join with os.pathsep rather than a hard-coded ":".
    # NOTE(review): the second entry is a developer-machine absolute path and
    # "src" is resolved against the current working directory — confirm both.
    pythonpath_entries = ["src", "/home/worsch/markitect-tool/src"]
    inherited_pythonpath = env.get("PYTHONPATH")
    if inherited_pythonpath:
        pythonpath_entries.append(inherited_pythonpath)
    env["PYTHONPATH"] = os.pathsep.join(pythonpath_entries)
    result = subprocess.run(
        [
            sys.executable,
            "-m",
            "infospace_bench",
            "generate",
            "from-source",
            str(book),
            "--workspace",
            str(tmp_path),
            "--slug",
            "lefevre-fixture-cli",
            "--name",
            "Lefevre Fixture (CLI)",
            "--profile",
            "trading-literature",
            "--fixture-responses",
            str(FIXTURE_RESPONSES),
            "--apply",
        ],
        check=False,  # the return code is asserted below so stderr reaches the failure message
        env=env,
        text=True,
        capture_output=True,
    )
    assert result.returncode == 0, result.stderr
    payload = json.loads(result.stdout)
    assert payload["status"] == "completed"
    assert "lefevre-fixture-cli" in payload["root"]