# infospace-bench/tests/test_generic_generator.py

import json
import os
import subprocess
import sys
import zipfile
from pathlib import Path

import yaml

from infospace_bench.generator import (
    init_generation_infospace,
    run_generation,
    status_generation,
)
from infospace_bench.openrouter import OpenRouterAssistedGenerationAdapter
from infospace_bench.source_intake import normalize_source


def cli_env() -> dict[str, str]:
    """Return a copy of the current environment for CLI subprocess calls.

    ``PYTHONPATH`` is set to the relative ``src`` directory followed by the
    markitect-tool source checkout. Any ``PYTHONPATH`` inherited from the
    parent process is appended after those entries instead of being dropped,
    and entries are joined with ``os.pathsep`` so the value is valid on every
    platform.

    Returns:
        A new mapping; ``os.environ`` itself is not modified.
    """
    env = os.environ.copy()
    # NOTE(review): the second entry is a machine-specific absolute path;
    # confirm whether it should come from configuration instead.
    entries = ["src", "/home/worsch/markitect-tool/src"]
    inherited = env.get("PYTHONPATH")
    if inherited:
        # Keep caller-provided entries, but behind ours so the local sources
        # still take precedence during import resolution.
        entries.append(inherited)
    env["PYTHONPATH"] = os.pathsep.join(entries)
    return env


def fixture_responses(path: Path) -> None:
    """Write the canned generation responses to ``path`` as a YAML document.

    The document has a single ``responses`` list with one entry per stage
    (``summarize-source``, ``extract-entities``, ``extract-relations`` and
    ``evaluate-entity``). Every entry uses ``"*"`` as its
    ``input_artifact_id`` and carries the markdown returned for that stage.

    Args:
        path: File that receives the UTF-8 encoded YAML.
    """

    def canned(stage_id: str, markdown: str) -> dict[str, str]:
        # The dump below keeps insertion order, so the key order is fixed here.
        return {
            "stage_id": stage_id,
            "input_artifact_id": "*",
            "markdown": markdown,
        }

    summary_markdown = "# Source Summary\n\nThe source describes reusable knowledge work.\n"

    # Two entity sections, each with a definition and a context.
    entities_markdown = (
        "# Knowledge Artifact\n\n"
        "## Definition\n\n"
        "A durable unit of structured knowledge derived from a source.\n\n"
        "## Context\n\n"
        "Generated from a generic source workflow.\n\n"
        "# Source Claim\n\n"
        "## Definition\n\n"
        "A claim preserved from the source for later review.\n\n"
        "## Context\n\n"
        "Used to keep provenance visible.\n"
    )

    # A single relation linking the two entities above.
    relations_markdown = (
        "# Knowledge Artifact Supports Source Claim\n\n"
        "## Subject\n\n"
        "Knowledge Artifact\n\n"
        "## Predicate\n\n"
        "supports\n\n"
        "## Object\n\n"
        "Source Claim\n\n"
        "## Relation Type\n\n"
        "support\n\n"
        "## Evidence\n\n"
        "The source links durable artifacts to explicit claims.\n"
    )

    # Front matter with two scores, followed by a heading.
    evaluation_markdown = (
        "---\n"
        "artifact_id: entity/knowledge-artifact.md\n"
        "evaluator: fixture\n"
        "evaluated_at: '2026-05-14T00:00:00'\n"
        "scores:\n"
        "  - name: groundedness\n"
        "    value: 4.0\n"
        "    max_value: 5.0\n"
        "  - name: usefulness\n"
        "    value: 4.0\n"
        "    max_value: 5.0\n"
        "---\n"
        "\n"
        "# Evaluation: entity/knowledge-artifact.md\n"
    )

    document = {
        "responses": [
            canned("summarize-source", summary_markdown),
            canned("extract-entities", entities_markdown),
            canned("extract-relations", relations_markdown),
            canned("evaluate-entity", evaluation_markdown),
        ]
    }
    serialized = yaml.safe_dump(document, sort_keys=False)
    path.write_text(serialized, encoding="utf-8")


def write_epub_fixture(path: Path) -> None:
    """Create a small zip archive at ``path`` holding two XHTML chapters.

    Only the two chapter members under ``OEBPS/`` are written; no other
    archive members are added.

    Args:
        path: Destination of the archive; opened in write mode.
    """
    chapters = (
        ("OEBPS/chapter1.xhtml", "<h1>Chapter One</h1><p>Alpha beta.</p>"),
        ("OEBPS/chapter2.xhtml", "<h1>Chapter Two</h1><p>Gamma delta.</p>"),
    )
    with zipfile.ZipFile(path, "w") as epub:
        for member_name, xhtml in chapters:
            epub.writestr(member_name, xhtml)


def test_source_intake_accepts_article_ebook_and_folder(tmp_path: Path) -> None:
    """Check ``normalize_source`` on an HTML file, an EPUB and a folder.

    The HTML article must yield a first chunk typed ``html`` with the page
    title, the id ``article-title`` and a digest that is identical on a second
    normalization. The EPUB must yield two ``epub`` chunks, and the folder
    must yield ``markdown`` and ``text`` chunks whose markdown all starts with
    a level-one heading.
    """
    html_path = tmp_path / "article.html"
    html_path.write_text(
        "<html><head><title>Article Title</title></head>"
        "<body><h1>Article Title</h1><p>One two three.</p></body></html>",
        encoding="utf-8",
    )

    epub_path = tmp_path / "book.epub"
    write_epub_fixture(epub_path)

    collection_dir = tmp_path / "collection"
    collection_dir.mkdir()
    folder_files = (
        ("note.md", "# Note\n\nMarkdown source."),
        ("memo.txt", "Plain text source."),
    )
    for filename, body in folder_files:
        (collection_dir / filename).write_text(body, encoding="utf-8")

    from_html = normalize_source(html_path)
    from_epub = normalize_source(epub_path)
    from_folder = normalize_source(collection_dir)

    lead = from_html[0]
    assert lead.source_type == "html"
    assert lead.title == "Article Title"
    assert lead.chunk_id == "article-title"
    # Normalizing the same file again must reproduce the digest.
    assert lead.digest == normalize_source(html_path)[0].digest

    epub_types = [item.source_type for item in from_epub]
    assert epub_types == ["epub", "epub"]

    folder_types = {item.source_type for item in from_folder}
    assert folder_types == {"markdown", "text"}
    assert all(item.markdown.startswith("# ") for item in from_folder)


def test_generate_from_source_cli_fixture_builds_infospace(tmp_path: Path) -> None:
    """Run ``generate from-source`` then ``generate status`` through the CLI.

    Both commands run as ``python -m infospace_bench`` subprocesses with the
    environment from ``cli_env()``. The generation must exit with code 0 and
    report ``completed``, the expected artifact and metrics files must exist
    under the reported root, and the status payload must show one source
    chunk, two entities, one relation and no staleness.
    """
    article = tmp_path / "article.md"
    article.write_text(
        "# Reusable Knowledge\n\nA source about claims and durable artifacts.",
        encoding="utf-8",
    )
    responses = tmp_path / "responses.yaml"
    fixture_responses(responses)

    cli_prefix = [sys.executable, "-m", "infospace_bench", "generate"]

    generate_command = [
        *cli_prefix,
        "from-source",
        str(article),
        "--workspace",
        str(tmp_path),
        "--slug",
        "article-space",
        "--name",
        "Article Space",
        "--fixture-responses",
        str(responses),
        "--apply",
    ]
    generated = subprocess.run(
        generate_command,
        check=False,
        env=cli_env(),
        text=True,
        capture_output=True,
    )
    # Surface stderr in the failure message when the command fails.
    assert generated.returncode == 0, generated.stderr
    payload = json.loads(generated.stdout)
    root = Path(payload["root"])

    status_command = [*cli_prefix, "status", str(root)]
    reported = subprocess.run(
        status_command,
        check=False,
        env=cli_env(),
        text=True,
        capture_output=True,
    )
    assert reported.returncode == 0, reported.stderr
    status_payload = json.loads(reported.stdout)

    assert payload["status"] == "completed"

    expected_files = (
        ("artifacts", "sources", "reusable-knowledge.md"),
        ("artifacts", "entities", "knowledge-artifact.md"),
        ("artifacts", "relations", "reusable-knowledge-relations.md"),
        ("output", "metrics", "metrics.yaml"),
    )
    for parts in expected_files:
        assert root.joinpath(*parts).is_file()

    assert status_payload["source_chunk_count"] == 1
    assert status_payload["entity_count"] == 2
    assert status_payload["relation_count"] == 1
    assert status_payload["stale"] is False


def test_generate_from_ebook_and_folder_fixtures(tmp_path: Path) -> None:
    """Generate an infospace from an EPUB and from a folder via the CLI.

    Each source is passed to ``generate from-source`` in a subprocess. The
    resulting root is then inspected with ``status_generation``, which must
    report the expected number of source chunks, two entities, one relation
    per source chunk and a single history snapshot.
    """
    responses = tmp_path / "responses.yaml"
    fixture_responses(responses)

    ebook = tmp_path / "book.epub"
    write_epub_fixture(ebook)

    folder = tmp_path / "folder"
    folder.mkdir()
    (folder / "first.md").write_text("# First\n\nOne source.", encoding="utf-8")
    (folder / "second.txt").write_text("Second source.", encoding="utf-8")

    cases = (
        (ebook, "book-space", 2),
        (folder, "folder-space", 2),
    )
    for source, slug, expected_sources in cases:
        # The display name is the slug with dashes replaced and title-cased.
        display_name = slug.replace("-", " ").title()
        command = [
            sys.executable,
            "-m",
            "infospace_bench",
            "generate",
            "from-source",
            str(source),
            "--workspace",
            str(tmp_path),
            "--slug",
            slug,
            "--name",
            display_name,
            "--fixture-responses",
            str(responses),
            "--apply",
        ]
        completed = subprocess.run(
            command,
            check=False,
            env=cli_env(),
            text=True,
            capture_output=True,
        )
        assert completed.returncode == 0, completed.stderr

        generated_root = Path(json.loads(completed.stdout)["root"])
        report = status_generation(generated_root)
        assert report["source_chunk_count"] == expected_sources
        assert report["entity_count"] == 2
        assert report["relation_count"] == expected_sources
        assert report["history_snapshot_count"] == 1


def test_generator_resume_is_idempotent_and_detects_stale_source(tmp_path: Path) -> None:
    """Check resume skipping and stale-source reporting.

    A first ``run_generation`` must complete, and a second run with
    ``resume=True`` must be reported as skipped. After the generated source
    artifact is overwritten on disk, ``status_generation`` must flag the
    infospace as stale and list ``source/note.md`` as the stale source.
    """
    note = tmp_path / "note.md"
    note.write_text("# Note\n\nInitial source.", encoding="utf-8")
    responses = tmp_path / "responses.yaml"
    fixture_responses(responses)

    infospace = init_generation_infospace(tmp_path, note, "note-space", name="Note Space")
    root = infospace.root

    initial_run = run_generation(root, fixture_responses=responses)
    resumed_run = run_generation(root, fixture_responses=responses, resume=True)

    # Edit the generated copy under the infospace, not the original input file.
    artifact_copy = root / "artifacts" / "sources" / "note.md"
    artifact_copy.write_text("# Note\n\nChanged source.", encoding="utf-8")
    report = status_generation(root)

    assert initial_run.status == "completed"
    assert resumed_run.status == "skipped"
    assert resumed_run.skipped is True
    assert report["stale"] is True
    assert report["stale_sources"] == ["source/note.md"]


def test_openrouter_adapter_uses_model_and_records_metadata() -> None:
    """Check the OpenRouter adapter against a recording fake transport.

    The fake transport stores each call and returns a fixed response. The
    test then verifies the model and ``Authorization`` header sent to the
    transport, and the markdown, provider and metadata (model, request id,
    usage) on the returned result.
    """
    captured_calls: list[dict] = []

    def recording_transport(payload: dict, headers: dict[str, str], endpoint: str) -> dict:
        # Record the call so the assertions below can inspect what was sent.
        call = {"payload": payload, "headers": headers, "endpoint": endpoint}
        captured_calls.append(call)
        canned_response = {
            "id": "or-request-1",
            "choices": [{"message": {"content": "# Generated\n\nContent."}}],
            "usage": {"prompt_tokens": 5, "completion_tokens": 3},
        }
        return canned_response

    adapter = OpenRouterAssistedGenerationAdapter(
        api_key="test-key",
        model="openai/gpt-4o-mini",
        transport=recording_transport,
        retry_limit=0,
    )

    # Ad-hoc stand-in for a request object: a throwaway class whose class
    # attributes carry the request fields.
    request_fields = {
        "prompt": "Generate markdown.",
        "stage_id": "extract-entities",
        "workflow_id": "generic-source-extract",
        "input_artifact_id": "source/example.md",
        "provider_hint": "openrouter",
        "metadata": {},
    }
    request_cls = type("Request", (), request_fields)
    result = adapter.generate(request_cls())

    first_call = captured_calls[0]
    assert first_call["payload"]["model"] == "openai/gpt-4o-mini"
    assert first_call["headers"]["Authorization"] == "Bearer test-key"
    assert result.markdown == "# Generated\n\nContent."
    assert result.provider == "openrouter"

    metadata = result.metadata
    assert metadata["model"] == "openai/gpt-4o-mini"
    assert metadata["request_id"] == "or-request-1"
    assert metadata["usage"]["completion_tokens"] == 3


def test_generic_generator_docs_cover_openrouter_resume_and_cost_caps() -> None:
    """Check that the generator guide mentions the key options and concepts.

    The guide is read from ``docs/generic-source-generator.md`` relative to
    the current working directory.
    """
    guide = Path("docs/generic-source-generator.md").read_text(encoding="utf-8")

    for required in ("OPENROUTER_API_KEY", "--model", "--max-chunks"):
        assert required in guide
    # "resume" is matched case-insensitively, unlike the other markers.
    assert "resume" in guide.lower()
    assert "fixture-responses" in guide