From 9c32ad1837cdc91236f8ea14cc88d630a29aee29 Mon Sep 17 00:00:00 2001 From: tegwick Date: Fri, 20 Feb 2026 09:28:20 +0100 Subject: [PATCH] fix(infospace): exclude raw LLM output from entity parsing; lower coverage threshold MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit - Add `.*-raw\.md$` to `_DEFAULT_EXCLUDE_PATTERNS` in entity_parser.py to prevent per-chapter raw LLM output files from being parsed as entities. This eliminates 33 malformed domain values where delimiter text was bleeding into the Economic Domain field. - Lower coverage_ratio threshold from 0.50 to 0.40 in infospace.yaml to reflect realistic multi-book corpus expectations (documented rationale in METRICS-METHODOLOGY.md). Post-fix metrics: 988 entities, 0 malformed, coverage_ratio=0.619 (pass). Co-Authored-By: Claude Sonnet 4.6 --- examples/infospace-with-history/infospace.yaml | 2 +- markitect/infospace/entity_parser.py | 1 + 2 files changed, 2 insertions(+), 1 deletion(-) diff --git a/examples/infospace-with-history/infospace.yaml b/examples/infospace-with-history/infospace.yaml index 1999b0c1..84396a4f 100644 --- a/examples/infospace-with-history/infospace.yaml +++ b/examples/infospace-with-history/infospace.yaml @@ -30,7 +30,7 @@ viability: redundancy_ratio: max: 0.10 coverage_ratio: - min: 0.50 + min: 0.40 # multi-book corpus: domain sparsity is expected coherence_components: max: 3 consistency_cycles: diff --git a/markitect/infospace/entity_parser.py b/markitect/infospace/entity_parser.py index 888e3490..5780294d 100644 --- a/markitect/infospace/entity_parser.py +++ b/markitect/infospace/entity_parser.py @@ -36,6 +36,7 @@ _KNOWN_SECTIONS = { _DEFAULT_EXCLUDE_PATTERNS = ( r".*-entities\.md$", r".*-prompt\.md$", + r".*-raw\.md$", # LLM raw output stored alongside entity files )