# markitect-main/markitect/infospace/cli.py

"""
CLI commands for infospace lifecycle management.

Provides ``markitect infospace`` subcommands for initialising,
inspecting, and evaluating infospaces.
"""

from __future__ import annotations

from pathlib import Path
from typing import Optional

import click

from markitect.infospace.config import (
    DisciplineBinding,
    InfospaceConfig,
    SchemaRegistry,
    TopicConfig,
    find_infospace_config,
    load_infospace_config,
    save_infospace_config,
)
from markitect.infospace.entity_parser import parse_entity_directory
from markitect.infospace.state import build_state


def _load_config_or_exit(config_path: Optional[str] = None) -> tuple:
    """Resolve and load infospace.yaml, or exit with an error.

    Args:
        config_path: Explicit path to the config file. When omitted (or
            empty) the location is discovered via ``find_infospace_config()``.

    Returns:
        A ``(config, path)`` pair: the value returned by
        ``load_infospace_config`` and the path it was loaded from.

    Raises:
        SystemExit: With status 1 when no config can be located, or when an
            explicitly supplied path does not exist.
    """
    if config_path:
        p = Path(config_path)
        # Report a mistyped --config path the same way as the "not found"
        # case below instead of passing a nonexistent file to the loader.
        if not p.exists():
            click.echo(f"Error: Config file not found: {p}", err=True)
            raise SystemExit(1)
    else:
        p = find_infospace_config()
        if p is None:
            click.echo("Error: No infospace.yaml found. Run 'markitect infospace init' first.", err=True)
            raise SystemExit(1)
    cfg = load_infospace_config(p)
    return cfg, p


@click.group(name="infospace")
def infospace_commands():
    """Manage infospaces — create, inspect, evaluate."""
    # Container group only: the docstring above is what click shows as the
    # group's help text, and subcommands attach via
    # ``@infospace_commands.command()``.


# ── init ─────────────────────────────────────────────────────────────


@infospace_commands.command()
@click.option("--topic", required=True, help="Topic name for the infospace.")
@click.option("--domain", default="", help="Knowledge domain.")
@click.option("--sources", default="", help="Path to source material directory.")
@click.option("--discipline", multiple=True, help="Discipline name (repeatable).")
@click.option("--output", "-o", default="infospace.yaml", help="Output config file path.")
def init(topic: str, domain: str, sources: str, discipline: tuple, output: str):
    """Initialise a new infospace configuration file."""
    target = Path(output)

    # Never overwrite an existing configuration file.
    if target.exists():
        click.echo(f"Error: {target} already exists.", err=True)
        raise SystemExit(1)

    # One binding per repeated --discipline flag, in the order given.
    bindings = []
    for discipline_name in discipline:
        bindings.append(DisciplineBinding(name=discipline_name))

    topic_cfg = TopicConfig(name=topic, domain=domain, sources=sources)
    new_config = InfospaceConfig(topic=topic_cfg, disciplines=bindings)

    save_infospace_config(new_config, target)
    click.echo(f"Created {target}")


# ── status ───────────────────────────────────────────────────────────


@infospace_commands.command()
@click.option("--config", "config_path", default=None, help="Path to infospace.yaml.")
def status(config_path: Optional[str]):
    """Show infospace status — entity count, domains, evaluation state."""
    cfg, cfg_path = _load_config_or_exit(config_path)
    base_dir = cfg_path.parent

    # A missing entities directory is not an error here: report zero entities.
    entity_root = base_dir / cfg.entities_dir
    parsed = parse_entity_directory(entity_root) if entity_root.is_dir() else []

    # Use the last entry of the history file, when there is one, as the
    # snapshot handed to build_state.
    latest = None
    history_file = base_dir / cfg.metrics_dir / "history.yaml"
    if history_file.is_file():
        from markitect.infospace.evaluation_io import read_history
        recorded = read_history(history_file)
        latest = recorded[-1] if recorded else None

    state = build_state(cfg, entities=parsed, snapshot=latest)

    click.echo(f"Infospace: {state.topic_name}")
    if cfg.topic.domain:
        click.echo(f"Domain:    {cfg.topic.domain}")
    click.echo(f"Entities:  {state.entity_count}")
    if state.domains:
        click.echo(f"Domains:   {', '.join(state.domains)}")
    if cfg.disciplines:
        click.echo(f"Disciplines: {', '.join(d.name for d in cfg.disciplines)}")

    if not state.has_evaluations:
        click.echo("Evaluations: none")
    else:
        click.echo(f"Last evaluated: {state.latest_snapshot.created_at.isoformat()}")


# ── entities ─────────────────────────────────────────────────────────


@infospace_commands.command()
@click.option("--config", "config_path", default=None, help="Path to infospace.yaml.")
@click.option(
    "--sort-by", "sort_key",
    type=click.Choice(["slug", "domain", "words"]),
    default="slug",
    help="Sort entities by field.",
)
def entities(config_path: Optional[str], sort_key: str):
    """List entities with metadata summary."""
    cfg, cfg_path = _load_config_or_exit(config_path)
    entity_root = cfg_path.parent / cfg.entities_dir

    if not entity_root.is_dir():
        click.echo("No entities directory found.")
        return

    found = parse_entity_directory(entity_root)
    if not found:
        click.echo("No entities found.")
        return

    # (key function, descending?) per --sort-by choice; anything not listed
    # falls back to ordering by slug.
    orderings = {
        "domain": (lambda ent: (ent.domain or "", ent.slug), False),
        "words": (lambda ent: ent.total_word_count, True),
    }
    key_fn, descending = orderings.get(sort_key, (lambda ent: ent.slug, False))
    rows = sorted(found, key=key_fn, reverse=descending)

    # Fixed-width table: 40 + 20 + 6 columns plus two separating spaces = 68.
    click.echo(f"{'Slug':<40} {'Domain':<20} {'Words':>6}")
    click.echo("-" * 68)
    for ent in rows:
        click.echo(f"{ent.slug:<40} {(ent.domain or '-'):<20} {ent.total_word_count:>6}")
    click.echo(f"\nTotal: {len(rows)} entities")


# ── evaluate ─────────────────────────────────────────────────────────


@infospace_commands.command()
@click.option("--config", "config_path", default=None, help="Path to infospace.yaml.")
@click.option("--provider", default="openrouter", help="LLM provider (openrouter, openai, etc.).")
@click.option("--model", default=None, help="LLM model name.")
@click.option("--entity", "entity_slug", default=None, help="Evaluate a single entity by slug.")
@click.option("--chapter", default=None, help="Evaluate entities from a specific chapter.")
def evaluate(
    config_path: Optional[str],
    provider: str,
    model: Optional[str],
    entity_slug: Optional[str],
    chapter: Optional[str],
):
    """Evaluate entities using LLM-based quality assessment."""
    # The docstring above is the command's --help text; details live in comments.
    cfg, cfg_path = _load_config_or_exit(config_path)
    root = cfg_path.parent

    entities_dir = root / cfg.entities_dir
    if not entities_dir.is_dir():
        click.echo("Error: No entities directory found.", err=True)
        raise SystemExit(1)

    entity_list = parse_entity_directory(entities_dir)
    if not entity_list:
        click.echo("No entities to evaluate.")
        return

    # Filter. --entity takes precedence: when both flags are given, --chapter
    # is not applied.
    if entity_slug:
        entity_list = [e for e in entity_list if e.slug == entity_slug]
        if not entity_list:
            click.echo(f"Error: Entity '{entity_slug}' not found.", err=True)
            raise SystemExit(1)
    elif chapter:
        # source_chapter may be unset on an entity (the listing command treats
        # domain as optional in the same way); fall back to an empty value so
        # such entities simply do not match instead of raising TypeError.
        entity_list = [e for e in entity_list if chapter in (e.source_chapter or "")]
        if not entity_list:
            click.echo(f"No entities found for chapter '{chapter}'.")
            return

    # Create adapter (imported lazily so other subcommands do not pay for it).
    from markitect.llm import create_adapter
    from markitect.prompts.execution.models import RunConfig
    adapter = create_adapter(provider, model=model)
    run_config = RunConfig(model_name=model or "default", temperature=0.3, max_tokens=2000)

    # Progress callback: one line per finished entity.
    def on_progress(done, total, result):
        outcome = result.status.upper()
        click.echo(f"  [{done}/{total}] {result.key}: {outcome}")

    click.echo(f"Evaluating {len(entity_list)} entities via {provider}...")

    from markitect.infospace.evaluate import run_entity_evaluation
    output_dir = root / cfg.evaluations_dir
    summary = run_entity_evaluation(
        config=cfg,
        entities=entity_list,
        adapter=adapter,
        run_config=run_config,
        output_dir=output_dir,
        progress_callback=on_progress,
    )

    click.echo(f"\nDone: {summary.succeeded} succeeded, {summary.failed} failed, {summary.skipped} skipped")
    if summary.total_tokens > 0:
        click.echo(f"Tokens used: {summary.total_tokens}")


# ── viability ────────────────────────────────────────────────────────


@infospace_commands.command()
@click.option("--config", "config_path", default=None, help="Path to infospace.yaml.")
def viability(config_path: Optional[str]):
    """Show viability dashboard — threshold checks and pass/fail."""
    cfg, cfg_path = _load_config_or_exit(config_path)

    if not cfg.viability:
        click.echo("No viability thresholds configured in infospace.yaml.")
        return

    def describe_bounds(threshold) -> str:
        # Render whichever of min/max is set, e.g. "min=0.5, max=0.9".
        parts = []
        if threshold.min is not None:
            parts.append(f"min={threshold.min}")
        if threshold.max is not None:
            parts.append(f"max={threshold.max}")
        return ", ".join(parts)

    # Collect numeric entries from metrics.yaml, if the file is present.
    metrics: dict = {}
    metrics_file = cfg_path.parent / cfg.metrics_dir / "metrics.yaml"
    if metrics_file.is_file():
        import yaml
        loaded = yaml.safe_load(metrics_file.read_text(encoding="utf-8"))
        if isinstance(loaded, dict):
            for key, value in loaded.items():
                if isinstance(value, (int, float)):
                    metrics[key] = float(value)

    state = build_state(cfg, metrics=metrics or None)

    # Without results there is nothing to grade; list the thresholds instead.
    if not state.viability_results:
        click.echo("No metrics available. Run evaluations first.")
        click.echo("\nConfigured thresholds:")
        for name, threshold in cfg.viability.items():
            click.echo(f"  {name}: {describe_bounds(threshold)}")
        return

    click.echo(f"{'Metric':<30} {'Value':>8} {'Threshold':>15} {'Status':>8}")
    click.echo("-" * 63)
    for result in state.viability_results:
        limits = describe_bounds(result.threshold)
        if result.passed:
            verdict = "PASS"
        else:
            verdict = "FAIL"
        click.echo(
            f"{result.metric:<30} {result.value:>8.4f} {limits:>15} {verdict:>8}"
        )

    click.echo()
    answer = "YES" if state.is_viable else "NO"
    click.echo(
        f"Viable: {answer} ({state.viability_pass_count}/{state.viability_total_count} thresholds met)"
    )


# ── check ───────────────────────────────────────────────────────────


@infospace_commands.command()
@click.option("--config", "config_path", default=None, help="Path to infospace.yaml.")
@click.option(
    "--concern", "concerns", multiple=True,
    type=click.Choice(["redundancy", "coverage", "coherence", "consistency", "granularity"]),
    help="Run specific concern(s). Omit to run all five.",
)
@click.option("--json", "as_json", is_flag=True, help="Output results as JSON.")
def check(config_path: Optional[str], concerns: tuple, as_json: bool):
    """Run collection-level quality checks (C1–C5)."""
    cfg, cfg_path = _load_config_or_exit(config_path)
    base_dir = cfg_path.parent

    entity_root = base_dir / cfg.entities_dir
    if not entity_root.is_dir():
        click.echo("Error: No entities directory found.", err=True)
        raise SystemExit(1)

    collection = parse_entity_directory(entity_root)
    if not collection:
        click.echo("No entities to check.")
        return

    from markitect.infospace.checks import run_all_checks

    # No --concern flags means "run everything", signalled by passing None.
    report = run_all_checks(
        entities=collection,
        checks=list(concerns) or None,
    )

    payload = report.to_dict()
    if as_json:
        import json
        click.echo(json.dumps(payload, indent=2))
    else:
        click.echo(f"Collection checks — {len(collection)} entities\n")
        for concern_name, concern_data in payload.items():
            label = concern_data.get("concern", concern_name.upper())
            click.echo(f"  {label} — {concern_name}")
            for field, value in concern_data.items():
                if field != "concern":
                    click.echo(f"    {field}: {value}")
            click.echo()

    # Everything below needs metrics; with none there is nothing to print
    # or to record.
    summary = report.metrics()
    if not summary:
        return

    human_readable = not as_json
    if human_readable:
        click.echo("Metrics summary:")
        for metric_name in sorted(summary):
            click.echo(f"  {metric_name}: {summary[metric_name]:.4f}")

    # Record to history (in JSON mode too, just silently).
    from markitect.infospace.history import record_check_results
    snap = record_check_results(report, cfg, base_dir, entity_count=len(collection))
    if human_readable:
        click.echo(f"\nRecorded snapshot {snap.snapshot_id}")


# ── history ─────────────────────────────────────────────────────────


@infospace_commands.command()
@click.option("--config", "config_path", default=None, help="Path to infospace.yaml.")
@click.option("--metric", default=None, help="Show trend for a specific metric.")
@click.option("--json", "as_json", is_flag=True, help="Output as JSON.")
def history(config_path: Optional[str], metric: Optional[str], as_json: bool):
    """Show metrics history — snapshots over time."""
    cfg, cfg_path = _load_config_or_exit(config_path)

    from markitect.infospace.history import get_history, metric_trend

    snapshots = get_history(cfg, cfg_path.parent)
    if not snapshots:
        click.echo("No history found. Run 'markitect infospace check' first.")
        return

    # --metric: show one metric's values over time instead of the table.
    if metric:
        points = metric_trend(snapshots, metric)
        if not points:
            click.echo(f"No data for metric '{metric}'.")
            return
        if as_json:
            import json
            click.echo(json.dumps(points, indent=2))
            return
        click.echo(f"Trend: {metric}\n")
        for point in points:
            # The first 19 characters of the date are what gets displayed.
            click.echo(f"  {point['date'][:19]}  {point['value']:.4f}")
        return

    if as_json:
        import json
        serialised = [s.to_dict() for s in snapshots]
        click.echo(json.dumps(serialised, indent=2, default=str))
        return

    # Default view: one numbered row per snapshot.
    click.echo(f"History: {len(snapshots)} snapshot(s)\n")
    click.echo(f"{'#':<4} {'Date':<20} {'Entities':>8} {'Metrics':>8}")
    click.echo("-" * 42)
    for index, snap in enumerate(snapshots, start=1):
        stamp = snap.created_at.isoformat()[:19]
        metric_count = len(snap.collection_metrics)
        click.echo(f"{index:<4} {stamp:<20} {snap.entity_count:>8} {metric_count:>8}")


@infospace_commands.command(name="history-diff")
@click.argument("date_a")
@click.argument("date_b")
@click.option("--config", "config_path", default=None, help="Path to infospace.yaml.")
def history_diff(date_a: str, date_b: str, config_path: Optional[str]):
    """Compare two history snapshots by date (YYYY-MM-DD)."""
    cfg, cfg_path = _load_config_or_exit(config_path)

    from markitect.infospace.history import find_snapshot_by_date, get_history
    from markitect.infospace.evaluation_io import diff_snapshots

    snapshots = get_history(cfg, cfg_path.parent)
    if len(snapshots) < 2:
        click.echo("Need at least two snapshots to diff.")
        return

    # Resolve both dates up front, then report the first one that failed.
    resolved = [
        (requested, find_snapshot_by_date(snapshots, requested))
        for requested in (date_a, date_b)
    ]
    for requested, match in resolved:
        if match is None:
            click.echo(f"No snapshot found near '{requested}'.")
            return

    (_, first), (_, second) = resolved
    if first.snapshot_id == second.snapshot_id:
        click.echo("Both dates resolve to the same snapshot.")
        return

    click.echo(diff_snapshots(first, second).summary())


# ── bind-discipline ─────────────────────────────────────────────────


@infospace_commands.command(name="bind-discipline")
@click.argument("discipline_path")
@click.option("--name", required=True, help="Name for the discipline.")
@click.option("--config", "config_path", default=None, help="Path to infospace.yaml.")
def bind_discipline_cmd(discipline_path: str, name: str, config_path: Optional[str]):
    """Bind a discipline infospace to the current infospace."""
    cfg, cfg_path = _load_config_or_exit(config_path)

    from markitect.infospace.composition import bind_discipline

    outcome = bind_discipline(
        cfg, name=name, path=discipline_path, root=cfg_path.parent
    )
    if outcome.error:
        click.echo(f"Error: {outcome.error}", err=True)
        raise SystemExit(1)

    # Only a successful bind is written back to disk.
    save_infospace_config(cfg, cfg_path)

    click.echo(f"Bound discipline '{name}' from {discipline_path}")
    click.echo(f"  Entities: {outcome.entity_count}")
    if outcome.has_config:
        if outcome.is_viable:
            verdict = "YES"
        else:
            verdict = "NO"
        click.echo(f"  Viable: {verdict}")


# ── disciplines ─────────────────────────────────────────────────────


@infospace_commands.command()
@click.option("--config", "config_path", default=None, help="Path to infospace.yaml.")
def disciplines(config_path: Optional[str]):
    """List bound disciplines and their viability status."""
    cfg, cfg_path = _load_config_or_exit(config_path)
    base_dir = cfg_path.parent

    if not cfg.disciplines:
        click.echo("No disciplines bound.")
        return

    from markitect.infospace.composition import check_discipline_status

    click.echo(f"{'Name':<30} {'Entities':>8} {'Viable':>8} {'Path'}")
    click.echo("-" * 70)
    for bound in cfg.disciplines:
        info = check_discipline_status(bound, base_dir)

        # "?" marks a discipline that is not viable and has no config.
        if info.is_viable:
            verdict = "YES"
        elif info.has_config:
            verdict = "NO"
        else:
            verdict = "?"

        click.echo(
            f"{info.name:<30} {info.entity_count:>8} {verdict:>8} {info.path}"
        )
        if info.error:
            click.echo(f"  Error: {info.error}")


# ── process ─────────────────────────────────────────────────────


@infospace_commands.command()
@click.argument("glob_pattern", default=None, required=False)
@click.option("--all", "process_all", is_flag=True, help="Process all source files.")
@click.option("--config", "config_path", default=None, help="Path to infospace.yaml.")
@click.option("--provider", default=None, help="LLM provider (openrouter, openai, etc.).")
@click.option("--model", default=None, help="LLM model name.")
@click.option(
    "--check-after-each",
    is_flag=True,
    help="Run collection checks (C1–C5) after each source file.",
)
@click.option("--no-commit", is_flag=True, help="Skip git commits.")
def process(
    glob_pattern: Optional[str],
    process_all: bool,
    config_path: Optional[str],
    provider: Optional[str],
    model: Optional[str],
    check_after_each: bool,
    no_commit: bool,
):
    """Process source files through the pipeline defined in infospace.yaml.

    GLOB_PATTERN is matched against the sources directory declared in
    infospace.yaml (default ``*.md``).  Use ``--all`` to process every
    source file.

    \b
    Examples:
      # Process chapters 1-3 from book 1
      markitect infospace process "book-1-chapter-0[1-3].md" --provider openrouter

      # Process all source files and check metrics after each
      markitect infospace process --all --provider openrouter --check-after-each

      # Dry run — load existing outputs only, no LLM calls
      markitect infospace process --all
    """
    # The docstring above is the command's --help text; keep details in comments.
    cfg, cfg_path = _load_config_or_exit(config_path)
    root = cfg_path.parent

    if not cfg.pipeline or not cfg.pipeline.stages:
        click.echo(
            "Error: No pipeline stages defined in infospace.yaml.\n"
            "Add a 'pipeline.stages' section with at least one stage.",
            err=True,
        )
        raise SystemExit(1)

    # Resolve sources directory; an empty 'topic.sources' means the config's
    # own directory.
    sources_dir = (root / cfg.topic.sources) if cfg.topic.sources else root
    if not sources_dir.is_dir():
        click.echo(
            f"Error: Sources directory not found: {sources_dir}\n"
            f"Set 'topic.sources' in infospace.yaml.",
            err=True,
        )
        raise SystemExit(1)

    # Collect source files. --all overrides any GLOB_PATTERN; with neither
    # given the pattern defaults to "*.md".
    pattern = "*.md" if process_all else (glob_pattern or "*.md")
    # Path.glob() yields directories as well as files. A directory that
    # happens to match the pattern is not a source document, so keep only
    # regular files.
    source_files = sorted(
        candidate for candidate in sources_dir.glob(pattern) if candidate.is_file()
    )

    if not source_files:
        if process_all:
            click.echo(f"No source files found in {sources_dir}")
        else:
            click.echo(
                f"No files matched: {pattern}\n"
                f"Sources directory: {sources_dir}"
            )
        return

    click.echo(f"Found {len(source_files)} source file(s) in {sources_dir.name}/")

    # Create LLM adapter. Without --provider the pipeline receives
    # adapter=None.
    adapter = None
    if provider:
        from markitect.llm import create_adapter

        # Model used when --model is omitted, per provider.
        _PROVIDER_DEFAULTS = {"openrouter": "arcee-ai/trinity-large-preview:free"}
        resolved_model = model or _PROVIDER_DEFAULTS.get(provider)
        adapter = create_adapter(provider, model=resolved_model)
        click.echo(f"LLM: {provider} ({resolved_model or 'default'})")
    else:
        click.echo("No LLM provider — will use existing outputs only (manual mode).")

    # Run pipeline
    from markitect.infospace.pipeline import SourcePipeline

    pipeline = SourcePipeline(cfg, root, adapter=adapter, no_commit=no_commit)

    total = len(source_files)
    completed = 0
    for i, source_file in enumerate(source_files, 1):
        click.echo(f"\n[{i}/{total}] {source_file.name}")
        success = pipeline.process_source(source_file)
        if success:
            completed += 1
            # Collection checks run only after a successfully processed file.
            if check_after_each:
                pipeline.run_collection_check()

    click.echo(f"\nDone: {completed}/{total} source file(s) fully processed.")


# ── stale-mappings ──────────────────────────────────────────────────


@infospace_commands.command(name="stale-mappings")
@click.option("--config", "config_path", default=None, help="Path to infospace.yaml.")
def stale_mappings(config_path: Optional[str]):
    """Check for stale mappings due to discipline changes."""
    cfg, cfg_path = _load_config_or_exit(config_path)
    base_dir = cfg_path.parent

    # With no bound disciplines there is nothing to check.
    if not cfg.disciplines:
        click.echo("No disciplines bound — no mappings to check.")
        return

    from markitect.infospace.composition import find_stale_mappings

    # Optional input: None when no mapping-references file is available.
    references = _load_mapping_references(cfg, base_dir)
    outdated = find_stale_mappings(cfg, base_dir, mapping_references=references)

    if not outdated:
        click.echo("No stale mappings detected.")
        return

    click.echo(f"Found {len(outdated)} stale mapping(s):\n")
    for item in outdated:
        click.echo(f"  {item.entity_slug} -> {item.discipline_entity}")
        click.echo(f"    {item.reason}")


def _load_mapping_references(
    cfg: InfospaceConfig, root: Path
) -> Optional[dict]:
    """Return the mapping stored in ``mapping-references.yaml``, if any.

    The file is looked up at ``root / cfg.metrics_dir``. ``None`` is returned
    when the file is absent or when its top-level YAML value is not a mapping.
    """
    candidate = root / cfg.metrics_dir / "mapping-references.yaml"
    if candidate.is_file():
        # Imported lazily: only needed when there is a file to parse.
        import yaml
        parsed = yaml.safe_load(candidate.read_text(encoding="utf-8"))
        return parsed if isinstance(parsed, dict) else None
    return None