# kontextual-engine/tests/test_markitect_tool_capacity.py

import importlib.util
import os
import time
from pathlib import Path

import pytest


# Module-wide markers applied to every test in this file. A skip marker is
# appended below when one of the two preconditions is not met.
pytestmark = [
    pytest.mark.integration,
    pytest.mark.markitect_tool,
    pytest.mark.capacity,
]

# Placeholder binding so the module still imports when the tests are skipped;
# it is rebound to the real package only when both gates pass.
mkt = None

if importlib.util.find_spec("markitect_tool") is None:
    # Gate 1: the optional markitect_tool package is not importable.
    pytestmark.append(
        pytest.mark.skip(
            reason="Install kontextual-engine[markdown] to run markitect-tool capacity tests."
        )
    )
elif os.environ.get("KONTEXTUAL_RUN_CAPACITY", "").lower() not in {"1", "true", "yes"}:
    # Gate 2: capacity tests are opt-in through an environment variable
    # (compared case-insensitively against "1", "true", "yes").
    pytestmark.append(
        pytest.mark.skip(
            reason="Set KONTEXTUAL_RUN_CAPACITY=1 to run opt-in capacity sentinels."
        )
    )
else:
    import markitect_tool as mkt


def test_large_markdown_parse_query_and_extract_capacity() -> None:
    """Parse, query, and extract from a 650-section document within time budgets.

    Budgets: 6.0s for parsing, 2.0s for the exact-heading query, and 2.0s for
    the exact-heading extraction. Timing is checked immediately after each
    step; the structural assertions run last.
    """
    target_selector = "sections[heading=Decision 640]"
    source_text = _large_decision_markdown(section_count=650)

    def parse():
        return mkt.parse_markdown(source_text, source_path="large.md")

    parse_seconds, document = _timed(parse)
    _assert_within("parse 650-section markdown", parse_seconds, seconds=6.0)

    def query():
        return mkt.query_document(document, target_selector)

    query_seconds, matches = _timed(query)
    _assert_within("query exact section in 650-section markdown", query_seconds, seconds=2.0)

    def extract():
        return mkt.extract_document(document, target_selector)

    extract_seconds, extracted = _timed(extract)
    _assert_within("extract exact section in 650-section markdown", extract_seconds, seconds=2.0)

    # The fixture has one title heading plus 650 decision headings.
    assert len(document.sections) == 651
    assert len(document.headings) == 651
    assert len(matches) == 1
    assert "CAPACITY-MARKER-640" in extracted[0]


def test_repeated_selectors_over_large_document_capacity() -> None:
    """Run six selectors twelve times each over a 420-section document in <= 5.0s.

    Parsing happens outside the timed region; only the 72 queries are timed.
    """
    document = mkt.parse_markdown(_large_decision_markdown(section_count=420))
    selectors = (
        "frontmatter.status",
        "headings[level=2]",
        "blocks[type=bullet_list]",
        "sections[contains~=CAPACITY-MARKER-120]",
        "sections[heading=Decision 240]",
        "metrics.document.sections",
    )
    rounds = 12

    def run_queries() -> list[int]:
        # Outer loop is the round, inner loop the selector, so results come
        # back round by round in selector order.
        return [
            len(mkt.query_document(document, selector))
            for _ in range(rounds)
            for selector in selectors
        ]

    elapsed, counts = _timed(run_queries)

    _assert_within("72 selector queries over 420-section markdown", elapsed, seconds=5.0)
    # Every selector must match something, and at least one must match broadly.
    assert min(counts) >= 1
    assert max(counts) >= 420


def test_include_fanout_compose_and_transform_capacity(tmp_path: Path) -> None:
    """Resolve, compose, and transform a 90-partial include fan-out within budgets.

    Writes 90 partial files plus a bundle containing one include directive per
    partial into ``tmp_path``. Budgets: 8.0s to resolve includes, 5.0s to
    compose the partials, 5.0s to transform the resolved bundle.
    """
    partials = [tmp_path / f"partial-{index:03}.md" for index in range(90)]
    for index, partial in enumerate(partials):
        partial.write_text(_partial_markdown(index), encoding="utf-8")

    directives = [
        f'<!-- mkt:include path="{partial.name}" selector="sections[heading=Include Target]" heading_delta="1" -->'
        for partial in partials
    ]
    bundle = tmp_path / "bundle.md"
    bundle.write_text("\n".join(directives), encoding="utf-8")

    def resolve():
        # Reading the bundle back is part of the timed region.
        return mkt.resolve_includes(
            bundle.read_text(encoding="utf-8"),
            base_dir=tmp_path,
            current_path=bundle,
        )

    resolve_seconds, included = _timed(resolve)
    _assert_within("resolve 90 include fan-out bundle", resolve_seconds, seconds=8.0)

    def compose():
        return mkt.compose_files(partials, title="Capacity Bundle", heading_delta=1)

    compose_seconds, composed = _timed(compose)
    _assert_within("compose 90 markdown partials", compose_seconds, seconds=5.0)

    def transform():
        return mkt.transform_markdown(
            included.markdown,
            set_frontmatter={"status": "capacity-check"},
            heading_delta=1,
            source_path=str(bundle),
        )

    transform_seconds, transformed = _timed(transform)
    _assert_within("transform resolved include fan-out bundle", transform_seconds, seconds=5.0)

    assert len(included.included_paths) == 90
    assert "### Include Target" in included.markdown
    assert composed.markdown.startswith("# Capacity Bundle")
    assert "status: capacity-check" in transformed.markdown


def test_context_package_many_sources_policy_filtering_capacity(tmp_path: Path) -> None:
    """Package 140 sources and activate them under a label policy within budgets.

    Even-indexed sources are labelled ``public`` and odd-indexed ones
    ``internal``; the ``reader`` subject is only allowed ``public``. Budgets:
    12.0s to create the package, 6.0s to activate it with policy filtering.
    """
    sources = [tmp_path / f"source-{index:03}.md" for index in range(140)]
    for index, source in enumerate(sources):
        if index % 2 == 0:
            label = "public"
        else:
            label = "internal"
        source.write_text(_context_source_markdown(index, label), encoding="utf-8")

    reader_rules = {
        "allowed_labels": ["public"],
        "allowed_actions": ["read", "activate"],
    }
    policy_document = {
        "id": "capacity-policy",
        "subjects": {"reader": reader_rules},
        "default_subject": "reader",
    }
    gateway = mkt.LocalLabelPolicyGateway(policy_document)

    def build_package():
        # Namespace and budget construction stay inside the timed region.
        return mkt.create_context_package_from_sources(
            "sections[heading=Decision]",
            sources,
            root=tmp_path,
            namespace=mkt.MemoryNamespace(project="kontextual-engine", task="capacity"),
            budget=mkt.ContextBudget(max_items=160),
        )

    build_seconds, package = _timed(build_package)
    _assert_within("create context package from 140 markdown sources", build_seconds, seconds=12.0)

    def activate():
        return mkt.activate_context_package(
            package,
            policy_gateway=gateway,
            subject="reader",
        )

    activate_seconds, activation = _timed(activate)
    _assert_within(
        "activate and policy-filter 140-source context package",
        activate_seconds,
        seconds=6.0,
    )

    # Half of the 140 sources are public, half internal.
    assert len(package.items) == 140
    assert len(activation.items) == 70
    assert "PUBLIC-CAPACITY-000" in activation.content
    assert "INTERNAL-CAPACITY-001" not in activation.content
    assert activation.policy["summary"]["denied"] == 70


def test_snapshot_identity_many_files_capacity(tmp_path: Path) -> None:
    """Compute snapshot identities for 120 distinct files in <= 4.0s.

    Each file is written with a different index, and the test expects 120
    distinct snapshot ids and ``sha256:``-prefixed content hashes.
    """
    paths = [tmp_path / f"snapshot-{index:03}.md" for index in range(120)]
    for index, target in enumerate(paths):
        target.write_text(_context_source_markdown(index, "public"), encoding="utf-8")

    def compute_identities():
        return [mkt.snapshot_identity_for_file(candidate) for candidate in paths]

    elapsed, identities = _timed(compute_identities)
    _assert_within("compute 120 markdown snapshot identities", elapsed, seconds=4.0)

    distinct_ids = {identity.snapshot_id for identity in identities}
    assert len(distinct_ids) == 120
    assert all(identity.content_hash.startswith("sha256:") for identity in identities)


def _large_decision_markdown(section_count: int) -> str:
    sections = [
        "---",
        "document_type: capacity-fixture",
        "status: active",
        "owner: Platform Knowledge",
        "---",
        "",
        "# Capacity Fixture",
        "",
    ]
    for index in range(section_count):
        sections.extend(
            [
                f"## Decision {index}",
                "",
                (
                    f"CAPACITY-MARKER-{index} records a synthetic decision section "
                    "with enough text to exercise parsing, selector matching, and extraction."
                ),
                "",
                "- Parser shape must stay stable.",
                "- Selector scans must remain bounded enough for adapter use.",
                "",
            ]
        )
    return "\n".join(sections)


def _partial_markdown(index: int) -> str:
    return "\n".join(
        [
            f"# Partial {index}",
            "",
            "## Include Target",
            "",
            f"Included capacity text {index}.",
            "",
            "## Ignore",
            "",
            "This section should not be selected by the include resolver.",
            "",
        ]
    )


def _context_source_markdown(index: int, label: str) -> str:
    marker = f"{label.upper()}-CAPACITY-{index:03}"
    return "\n".join(
        [
            "---",
            "document_type: capacity-source",
            f"status: {'active' if label == 'public' else 'draft'}",
            "policy:",
            f"  labels: [{label}]",
            "---",
            "",
            f"# Capacity Source {index}",
            "",
            "## Decision",
            "",
            f"{marker} uses Markitect context packaging for generated source {index}.",
            "",
        ]
    )


def _timed(operation):
    start = time.perf_counter()
    value = operation()
    return time.perf_counter() - start, value


def _assert_within(name: str, elapsed: float, *, seconds: float) -> None:
    assert elapsed <= seconds, f"{name} took {elapsed:.3f}s, expected <= {seconds:.3f}s"