# markitect-main/tests/unit/infospace/test_entity_parser.py

"""Tests for markitect.infospace.entity_parser and EntityMeta."""

import logging
from pathlib import Path

import pytest

from markitect.infospace import EntityMeta, parse_entity_file, parse_entity_directory


# ── Fixtures ────────────────────────────────────────────────────────

# Entity document with a title-case H1 and all six H2 sections used in this
# module: Definition, Source Chapter, Context, Economic Domain, Smith's
# Original Wording and Modern Interpretation.
# NOTE: the text contains a non-ASCII ellipsis character, so the bytes written
# to disk depend on the encoding used when the fixture is saved.
COMPLETE_ENTITY = """\
# Division of Labour

## Definition

The separation of a work process into a number of distinct tasks, each performed
by a specialised worker, resulting in a significant increase in the productive
powers of labour.

## Source Chapter

Book I, Chapter 1: "Of the Division of Labour"

## Context

The division of labour is the central argument of the chapter.

## Economic Domain

Production

## Smith's Original Wording

"The greatest improvements in the productive powers of labour…"

## Modern Interpretation

The division of labour remains a foundational concept in economics.
"""

# Entity document with an H1 and only four H2 sections (Definition, Source
# Chapter, Context, Economic Domain). The "Smith's Original Wording" and
# "Modern Interpretation" sections are deliberately absent so tests can check
# how missing sections are reported.
MINIMAL_ENTITY = """\
# Minimal Entity

## Definition

A brief definition.

## Source Chapter

Book I, Chapter 1

## Context

Some context.

## Economic Domain

Exchange
"""

# Entity document whose H1 is written as a hyphenated, lower-case slug
# ("effectual-demand") instead of a title; it still carries all six H2
# sections. Used to check that the raw H1 is preserved and flagged as not
# title case.
# NOTE: the text contains a non-ASCII ellipsis character.
SLUG_H1_ENTITY = """\
# effectual-demand

## Definition

Effectual demand is the demand by consumers who are willing and able to pay.

## Source Chapter

Book 1, Chapter 7

## Context

Context for effectual demand.

## Economic Domain

Exchange

## Smith's Original Wording

"Such people may be called the effectual demanders…"

## Modern Interpretation

Represents the intersection of desire and purchasing power.
"""

# Malformed document: it starts with an H2 and has no H1 heading at all.
# The tests below use it to trigger the "No H1" error path.
NO_H1 = """\
## Only H2

Some content.
"""


# ── parse_entity_file ────────────────────────────────────────────────

class TestParseEntityFile:
    """Tests for ``parse_entity_file`` applied to a single markdown file.

    Every fixture is written with an explicit ``encoding="utf-8"``: some of
    the fixture texts contain non-ASCII characters, and ``Path.write_text``
    would otherwise fall back to the platform's locale encoding, making the
    bytes on disk (and therefore the test outcome) platform dependent.
    """

    def test_complete_entity(self, tmp_path):
        """A document with every section populates all metadata fields."""
        f = tmp_path / "division-of-labour.md"
        f.write_text(COMPLETE_ENTITY, encoding="utf-8")
        meta = parse_entity_file(f)

        assert meta.slug == "division_of_labour"
        assert meta.title == "Division of Labour"
        assert meta.h1_is_title_case is True
        assert meta.has_original_wording is True
        assert meta.domain == "Production"
        assert meta.definition_word_count > 20
        assert "separation" in meta.definition.lower()
        assert meta.source_path == str(f)
        assert "definition" in meta.section_slugs
        assert "smith_s_original_wording" in meta.section_slugs

    def test_minimal_entity(self, tmp_path):
        """A document without the optional sections still parses."""
        f = tmp_path / "minimal-entity.md"
        f.write_text(MINIMAL_ENTITY, encoding="utf-8")
        meta = parse_entity_file(f)

        assert meta.slug == "minimal_entity"
        assert meta.has_original_wording is False
        assert meta.original_wording == ""
        assert meta.modern_interpretation == ""
        assert meta.domain == "Exchange"

    def test_slug_format_h1(self, tmp_path):
        """An H1 written as a slug is kept verbatim and not seen as a title."""
        f = tmp_path / "effectual-demand.md"
        f.write_text(SLUG_H1_ENTITY, encoding="utf-8")
        meta = parse_entity_file(f)

        assert meta.h1_raw == "effectual-demand"
        assert meta.h1_is_title_case is False
        assert meta.slug == "effectual_demand"
        assert meta.has_original_wording is True

    def test_missing_h1_raises(self, tmp_path):
        """A document without an H1 heading raises ``ValueError``."""
        f = tmp_path / "no-h1.md"
        f.write_text(NO_H1, encoding="utf-8")
        with pytest.raises(ValueError, match="No H1"):
            parse_entity_file(f)

    def test_missing_sections_return_empty(self, tmp_path):
        """Sections absent from the document are reported as empty strings."""
        f = tmp_path / "minimal.md"
        f.write_text(MINIMAL_ENTITY, encoding="utf-8")
        meta = parse_entity_file(f)

        # Optional sections not present → empty string
        assert meta.original_wording == ""
        assert meta.modern_interpretation == ""

    def test_word_count_accuracy(self, tmp_path):
        """The definition word count matches the number of words written."""
        f = tmp_path / "test.md"
        f.write_text(
            "# Test\n\n## Definition\n\none two three four five\n",
            encoding="utf-8",
        )
        meta = parse_entity_file(f)
        assert meta.definition_word_count == 5


# ── parse_entity_directory ──────────────────────────────────────────

class TestParseEntityDirectory:
    """Tests for ``parse_entity_directory`` over a directory of markdown files.

    Fixture files are written with an explicit ``encoding="utf-8"`` because
    some fixture texts contain non-ASCII characters and ``Path.write_text``
    otherwise uses the platform's locale encoding.
    """

    def _make_dir(self, tmp_path):
        """Create a temporary entity directory.

        The directory holds two entity files plus one ``*-entities.md`` and
        one ``*-prompt.md`` file, which the tests expect to be excluded by
        the default patterns.

        Returns:
            The path of the created ``entities`` directory.
        """
        d = tmp_path / "entities"
        d.mkdir()
        (d / "entity-a.md").write_text(COMPLETE_ENTITY, encoding="utf-8")
        (d / "entity-b.md").write_text(MINIMAL_ENTITY, encoding="utf-8")
        # files that should be excluded by default
        (d / "book-1-chapter-01-entities.md").write_text(
            "# View\n\nview file", encoding="utf-8"
        )
        (d / "book-1-chapter-01-prompt.md").write_text(
            "# Prompt\n\nprompt file", encoding="utf-8"
        )
        return d

    def test_excludes_view_and_prompt(self, tmp_path):
        """With default patterns only the two real entity files are parsed."""
        d = self._make_dir(tmp_path)
        results = parse_entity_directory(d)
        slugs = {e.slug for e in results}

        assert "division_of_labour" in slugs
        assert "minimal_entity" in slugs
        # Excluded files should not be parsed as entities
        assert len(results) == 2

    def test_custom_exclude_patterns(self, tmp_path):
        """Custom ``exclude_patterns`` replace the default exclusions."""
        d = self._make_dir(tmp_path)
        # Only exclude prompt files, allow entity views
        results = parse_entity_directory(d, exclude_patterns=[r".*-prompt\.md$"])
        assert len(results) == 3  # entity-a, entity-b, chapter-01-entities

    def test_malformed_skipped_with_warning(self, tmp_path, caplog):
        """A file that fails to parse is skipped and named in a warning."""
        d = tmp_path / "entities"
        d.mkdir()
        (d / "good.md").write_text(COMPLETE_ENTITY, encoding="utf-8")
        (d / "bad.md").write_text(NO_H1, encoding="utf-8")

        with caplog.at_level(logging.WARNING):
            results = parse_entity_directory(d)

        assert len(results) == 1
        assert "bad.md" in caplog.text


# ── EntityMeta round-trip ───────────────────────────────────────────

class TestEntityMetaRoundTrip:
    """Tests for ``EntityMeta.to_dict`` / ``EntityMeta.from_dict``."""

    def test_to_dict_from_dict(self, tmp_path):
        """Serialising and restoring a parsed entity preserves its fields."""
        f = tmp_path / "entity.md"
        # Explicit UTF-8: the fixture contains non-ASCII characters, and the
        # default for write_text is the platform's locale encoding.
        f.write_text(COMPLETE_ENTITY, encoding="utf-8")
        original = parse_entity_file(f)

        data = original.to_dict()
        restored = EntityMeta.from_dict(data)

        assert restored.slug == original.slug
        assert restored.title == original.title
        assert restored.definition == original.definition
        assert restored.h1_is_title_case == original.h1_is_title_case
        assert restored.section_slugs == original.section_slugs
        assert restored.definition_word_count == original.definition_word_count

    def test_from_dict_ignores_unknown_keys(self):
        """Keys that are not ``EntityMeta`` fields are dropped by ``from_dict``."""
        data = {
            "slug": "test",
            "title": "Test",
            "h1_raw": "Test",
            "unknown_field": "should be ignored",
        }
        meta = EntityMeta.from_dict(data)
        assert meta.slug == "test"
        # Passes when the attribute is absent altogether, or at least was not
        # stored on the instance itself.
        assert not hasattr(meta, "unknown_field") or "unknown_field" not in meta.__dict__