From 9c32ad1837cdc91236f8ea14cc88d630a29aee29 Mon Sep 17 00:00:00 2001
From: tegwick <bernd.worsch@gmail.com>
Date: Fri, 20 Feb 2026 09:28:20 +0100
Subject: [PATCH] fix(infospace): exclude raw LLM output from entity parsing;
 lower coverage threshold
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

- Add `.*-raw\.md$` to `_DEFAULT_EXCLUDE_PATTERNS` in entity_parser.py so
  that per-chapter raw LLM output files are no longer parsed as entities.
  This eliminates the 33 malformed domain values caused by delimiter text
  bleeding into the Economic Domain field.
- Lower the coverage_ratio `min` threshold from 0.50 → 0.40 in
  infospace.yaml to reflect realistic expectations for a multi-book corpus,
  where domain sparsity is expected (rationale documented in
  METRICS-METHODOLOGY.md).

Post-fix metrics: 988 entities, 0 malformed, coverage_ratio=0.619 (pass).

Co-Authored-By: Claude Sonnet 4.6 <noreply@anthropic.com>
---
 examples/infospace-with-history/infospace.yaml | 2 +-
 markitect/infospace/entity_parser.py           | 1 +
 2 files changed, 2 insertions(+), 1 deletion(-)

diff --git a/examples/infospace-with-history/infospace.yaml b/examples/infospace-with-history/infospace.yaml
index 1999b0c1..84396a4f 100644
--- a/examples/infospace-with-history/infospace.yaml
+++ b/examples/infospace-with-history/infospace.yaml
@@ -30,7 +30,7 @@ viability:
   redundancy_ratio:
     max: 0.10
   coverage_ratio:
-    min: 0.50
+    min: 0.40  # multi-book corpus: domain sparsity is expected
   coherence_components:
     max: 3
   consistency_cycles:
diff --git a/markitect/infospace/entity_parser.py b/markitect/infospace/entity_parser.py
index 888e3490..5780294d 100644
--- a/markitect/infospace/entity_parser.py
+++ b/markitect/infospace/entity_parser.py
@@ -36,6 +36,7 @@ _KNOWN_SECTIONS = {
 _DEFAULT_EXCLUDE_PATTERNS = (
     r".*-entities\.md$",
     r".*-prompt\.md$",
+    r".*-raw\.md$",  # LLM raw output stored alongside entity files
 )