IB-WP-0016-T01: spine-aware EPUB3 intake

Parse META-INF/container.xml and the OPF package document, then iterate
documents in spine reading order instead of archive-name sort. Classify
each spine item (body, cover, nav, toc, header, footer, notes, license,
auxiliary) and exclude non-body sections by default; include_non_body=True
opts them back in for inspection. Capture OPF book metadata (title,
creator, language, subjects, rights, identifier, source_url, modified)
onto every chunk and propagate it through source artifact provenance.
Preserve the legacy zip-without-OPF fallback for malformed EPUBs.

Real Lefevre EPUB now yields 148 body chunks in spine order (was 155
mixed, archive-sorted) with cover=1, header=1, footer=4 detected and
dropped. 78 tests pass.

Co-Authored-By: Claude Opus 4.7 <noreply@anthropic.com>
This commit is contained in:
2026-05-17 13:52:24 +02:00
parent ead2f335f3
commit 5b6a63fb7a
5 changed files with 496 additions and 35 deletions

View File

@@ -68,3 +68,24 @@ able to show:
- selected chapter or chunk filters for smoke runs
- deterministic fixture acceptance on a small Lefevre-like subset
- optional live one-chapter smoke run with explicit provider/model/cost caps
## T01 Result (2026-05-17)
Spine-aware EPUB3 intake landed. Re-running the local Lefevre EPUB through
`normalize_source(...)` now yields:
- 148 body chunks (default), down from the original 155 mixed chunks
- Spine reading order: indices 0..27 in declared order, not archive-name sort
- Full OPF metadata on every chunk's `book_metadata`:
title, creator, language, subjects, rights, identifier, source_url, modified
- Section roles classified across all 154 chunks (body and non-body):
`body=148`, `footer=4`, `cover=1`, `header=1`
- The cover, the `*** START OF…` header section, and the four Gutenberg
footer/license/notes sections are now excluded from generation input by
default and available via `include_non_body=True` for inspection
- The legacy zip-without-OPF fallback path is preserved for malformed EPUBs
The remaining gap is title collapse: all body sections still share the
Project Gutenberg page title because chapter headings are not yet read from
in-document `<h1>` content. That collapse is T02's scope (chapter-aware
chunking and stable IDs from in-document headings).

View File

@@ -258,6 +258,9 @@ def _register_source_chunks(root: Path, chunks: list[SourceChunk]) -> None:
"chunk_count": chunk.chunk_count,
"imported_at": chunk.imported_at,
"extractor_version": chunk.extractor_version,
"section_role": chunk.section_role,
"spine_index": chunk.spine_index,
"book_metadata": dict(chunk.book_metadata),
},
)

View File

@@ -4,21 +4,39 @@ import hashlib
import html
import re
import zipfile
from dataclasses import asdict, dataclass
from dataclasses import asdict, dataclass, field
from datetime import datetime, timezone
from pathlib import Path
from typing import Iterable
from xml.etree import ElementTree as ET
from .errors import InfospaceError
from .semantics import slugify
EXTRACTOR_VERSION = "generic-source-intake-v1"
EXTRACTOR_VERSION = "generic-source-intake-v2"
SUPPORTED_EXTENSIONS = {".md", ".markdown", ".txt", ".html", ".htm", ".epub"}
HTML_TITLE_RE = re.compile(r"<title[^>]*>(?P<title>.*?)</title>", re.I | re.S)
HTML_H1_RE = re.compile(r"<h1[^>]*>(?P<title>.*?)</h1>", re.I | re.S)
SCRIPT_STYLE_RE = re.compile(r"<(script|style)[^>]*>.*?</\1>", re.I | re.S)
TAG_RE = re.compile(r"<[^>]+>")
OPF_NS = "http://www.idpf.org/2007/opf"
DC_NS = "http://purl.org/dc/elements/1.1/"
CONTAINER_NS = "urn:oasis:names:tc:opendocument:xmlns:container"
SECTION_ROLE_BODY = "body"
SECTION_ROLE_COVER = "cover"
SECTION_ROLE_NAV = "nav"
SECTION_ROLE_TOC = "toc"
SECTION_ROLE_HEADER = "header"
SECTION_ROLE_FOOTER = "footer"
SECTION_ROLE_NOTES = "notes"
SECTION_ROLE_LICENSE = "license"
SECTION_ROLE_AUXILIARY = "auxiliary"
PG_START_MARKERS = ("*** START OF THE PROJECT GUTENBERG EBOOK", "*** START OF THIS PROJECT GUTENBERG EBOOK")
PG_END_MARKERS = ("*** END OF THE PROJECT GUTENBERG EBOOK", "*** END OF THIS PROJECT GUTENBERG EBOOK")
@dataclass(frozen=True)
class SourceChunk:
@@ -32,6 +50,9 @@ class SourceChunk:
chunk_count: int
imported_at: str
extractor_version: str = EXTRACTOR_VERSION
section_role: str = SECTION_ROLE_BODY
spine_index: int | None = None
book_metadata: dict = field(default_factory=dict)
def to_dict(self) -> dict:
return asdict(self)
@@ -44,6 +65,23 @@ class _SourceDocument:
source_type: str
original_path: str
base_slug: str
section_role: str = SECTION_ROLE_BODY
spine_index: int | None = None
book_metadata: dict = field(default_factory=dict)
@dataclass(frozen=True)
class _EpubManifestItem:
    """One `<item>` entry read from the OPF manifest."""

    # Value of the item's `id` attribute; spine entries refer to it by idref.
    item_id: str
    # Archive member name: the item's `href` joined onto the OPF's directory.
    href: str
    # Value of the `media-type` attribute ("" when the attribute is absent).
    media_type: str
    # Whitespace-separated tokens of the `properties` attribute (e.g. "nav").
    properties: frozenset
@dataclass(frozen=True)
class _EpubSpineEntry:
    """One `<itemref>` entry read from the OPF spine."""

    # Manifest id this entry points at (the itemref's `idref` attribute).
    item_id: str
    # False only when the itemref carries linear="no"; True otherwise.
    linear: bool
def normalize_source(
@@ -51,6 +89,7 @@ def normalize_source(
*,
max_words: int = 800,
max_chunks: int | None = None,
include_non_body: bool = False,
) -> list[SourceChunk]:
source_path = Path(source)
if not source_path.exists():
@@ -59,7 +98,7 @@ def normalize_source(
f"Source path does not exist: {source_path}",
{"source": str(source_path)},
)
documents = list(_iter_documents(source_path))
documents = list(_iter_documents(source_path, include_non_body=include_non_body))
if not documents:
raise InfospaceError(
"unsupported_source",
@@ -91,6 +130,9 @@ def normalize_source(
chunk_index=index,
chunk_count=len(pieces),
imported_at=imported_at,
section_role=document.section_role,
spine_index=document.spine_index,
book_metadata=dict(document.book_metadata),
)
)
if max_chunks is not None and max_chunks > 0 and len(chunks) >= max_chunks:
@@ -98,11 +140,13 @@ def normalize_source(
return chunks
def _iter_documents(source_path: Path) -> Iterable[_SourceDocument]:
def _iter_documents(
source_path: Path, *, include_non_body: bool
) -> Iterable[_SourceDocument]:
if source_path.is_dir():
for path in sorted(source_path.rglob("*")):
if path.is_file() and path.suffix.lower() in SUPPORTED_EXTENSIONS:
yield from _iter_documents(path)
yield from _iter_documents(path, include_non_body=include_non_body)
return
suffix = source_path.suffix.lower()
@@ -113,7 +157,7 @@ def _iter_documents(source_path: Path) -> Iterable[_SourceDocument]:
elif suffix in (".html", ".htm"):
yield _html_document(source_path, source_type="html")
elif suffix == ".epub":
yield from _epub_documents(source_path)
yield from _epub_documents(source_path, include_non_body=include_non_body)
def _markdown_document(path: Path) -> _SourceDocument:
@@ -163,35 +207,18 @@ def _html_document(
)
def _epub_documents(path: Path) -> Iterable[_SourceDocument]:
def _epub_documents(
path: Path, *, include_non_body: bool
) -> Iterable[_SourceDocument]:
try:
with zipfile.ZipFile(path) as archive:
names = [
name
for name in sorted(archive.namelist())
if Path(name).suffix.lower() in {".html", ".htm", ".xhtml", ".txt", ".md"}
and not name.endswith("/")
]
for name in names:
raw = archive.read(name).decode("utf-8", errors="replace")
pseudo_path = Path(name)
if pseudo_path.suffix.lower() in {".txt", ".md"}:
title = _markdown_title(raw) or _title_from_path(pseudo_path)
markdown = _ensure_h1(_normalize_newlines(raw).strip() + "\n", title)
yield _SourceDocument(
title=title,
markdown=markdown,
source_type="epub",
original_path=f"{path}!{name}",
base_slug=slugify(title) or slugify(pseudo_path.stem) or "source",
)
else:
yield _html_document(
pseudo_path,
source_type="epub",
original_path=f"{path}!{name}",
text=raw,
)
opf_path = _resolve_opf_path(archive)
if opf_path is not None:
yield from _epub3_spine_documents(
archive, path, opf_path, include_non_body=include_non_body
)
else:
yield from _epub_legacy_documents(archive, path)
except zipfile.BadZipFile as exc:
raise InfospaceError(
"invalid_epub_source",
@@ -200,6 +227,243 @@ def _epub_documents(path: Path) -> Iterable[_SourceDocument]:
) from exc
def _resolve_opf_path(archive: zipfile.ZipFile) -> str | None:
    """Return the archive member name of the OPF package document.

    The name is taken from the `full-path` attribute of the first `rootfile`
    element in META-INF/container.xml. Returns None when container.xml is
    missing or not well-formed, when no rootfile is declared, when
    `full-path` is empty, or when it names a member that is not in the
    archive.
    """
    try:
        container = ET.fromstring(archive.read("META-INF/container.xml"))
    except (KeyError, ET.ParseError):
        # KeyError: no container.xml member; ParseError: unreadable XML.
        return None
    declared = container.find(
        f"{{{CONTAINER_NS}}}rootfiles/{{{CONTAINER_NS}}}rootfile"
    )
    opf_name = declared.attrib.get("full-path") if declared is not None else None
    if opf_name and opf_name in archive.namelist():
        return opf_name
    return None
def _parse_opf(
    archive: zipfile.ZipFile, opf_path: str
) -> tuple[dict, dict[str, _EpubManifestItem], list[_EpubSpineEntry]]:
    """Parse the OPF package document into metadata, manifest and spine.

    Returns a 3-tuple:
      * the book metadata dict produced by `_parse_opf_metadata`,
      * the manifest as a mapping of item id -> `_EpubManifestItem`, with
        each href joined onto the directory that contains the OPF,
      * the spine as a list of `_EpubSpineEntry` in document order.

    Manifest items lacking an id or href, and spine itemrefs lacking an
    idref, are ignored. The OPF bytes are decoded as UTF-8 with replacement
    of undecodable bytes before parsing.
    """
    package = ET.fromstring(
        archive.read(opf_path).decode("utf-8", errors="replace")
    )
    book_metadata = _parse_opf_metadata(package)
    opf_dir = _zip_dirname(opf_path)

    items: dict[str, _EpubManifestItem] = {}
    for node in package.findall(f"{{{OPF_NS}}}manifest/{{{OPF_NS}}}item"):
        attrs = node.attrib
        node_href = attrs.get("href", "")
        node_id = attrs.get("id", "")
        if node_href and node_id:
            items[node_id] = _EpubManifestItem(
                item_id=node_id,
                href=_join_zip_path(opf_dir, node_href),
                media_type=attrs.get("media-type", ""),
                properties=frozenset((attrs.get("properties") or "").split()),
            )

    reading_order: list[_EpubSpineEntry] = [
        _EpubSpineEntry(
            item_id=ref.attrib["idref"],
            # Only an explicit linear="no" marks the entry as non-linear.
            linear=ref.attrib.get("linear", "yes") != "no",
        )
        for ref in package.findall(f"{{{OPF_NS}}}spine/{{{OPF_NS}}}itemref")
        if ref.attrib.get("idref")
    ]
    return book_metadata, items, reading_order
def _parse_opf_metadata(opf_root: ET.Element) -> dict:
    """Collect book-level metadata from the OPF `<metadata>` element.

    Reads Dublin Core children (title, creator, language, rights, subject,
    identifier, source) plus `<meta property="dcterms:modified">` and, as a
    fallback for the source URL, `<meta property="dcterms:source">`.

    Returns a dict containing only the keys that had a value: `title`,
    `creator` (first creator), `creators` (all creators, only when more than
    one), `language`, `rights`, `subjects` (list), `identifier`,
    `source_url`, `modified`. Returns {} when there is no `<metadata>`
    element.

    Values that are empty after `_collapse_ws` are discarded, so an element
    holding only whitespace neither appears in `subjects`/`creators` nor
    hides a later element that does carry text.
    """
    md = opf_root.find(f"{{{OPF_NS}}}metadata")
    if md is None:
        return {}

    def _all_text(tag: str) -> list[str]:
        collapsed = (
            _collapse_ws(el.text)
            for el in md.findall(f"{{{DC_NS}}}{tag}")
            if el.text
        )
        # Filter after collapsing: whitespace-only text is truthy before it.
        return [value for value in collapsed if value]

    def _first_text(tag: str) -> str:
        values = _all_text(tag)
        return values[0] if values else ""

    out: dict = {}
    title = _first_text("title")
    if title:
        out["title"] = title
    creators = _all_text("creator")
    if creators:
        out["creator"] = creators[0]
        if len(creators) > 1:
            out["creators"] = creators
    language = _first_text("language")
    if language:
        out["language"] = language
    rights = _first_text("rights")
    if rights:
        out["rights"] = rights
    subjects = _all_text("subject")
    if subjects:
        out["subjects"] = subjects
    identifier = _first_text("identifier")
    if identifier:
        out["identifier"] = identifier
    source_url = _first_text("source")
    if source_url:
        out["source_url"] = source_url
    for meta in md.findall(f"{{{OPF_NS}}}meta"):
        prop = meta.attrib.get("property", "")
        text = _collapse_ws(meta.text) if meta.text else ""
        if not text:
            continue
        if prop == "dcterms:modified":
            out["modified"] = text
        elif prop == "dcterms:source" and "source_url" not in out:
            # dc:source wins; dcterms:source only fills the gap.
            out["source_url"] = text
    return out
def _epub3_spine_documents(
    archive: zipfile.ZipFile,
    source_path: Path,
    opf_path: str,
    *,
    include_non_body: bool,
) -> Iterable[_SourceDocument]:
    """Yield one document per readable spine item, in spine reading order.

    Args:
        archive: The opened EPUB zip.
        source_path: Path of the EPUB file; used for the fallback book title
            and as the prefix of each document's `original_path`.
        opf_path: Archive member name of the OPF package document.
        include_non_body: When False, spine items whose classified role is
            not body are skipped.

    Spine entries that do not resolve to a manifest item, or whose member is
    missing from the archive, are skipped. `spine_index` is the position in
    the declared spine, so skipped entries leave gaps.

    If the OPF is not well-formed XML, the archive is read through
    `_epub_legacy_documents` instead. This mirrors how an unreadable
    container.xml is handled, so a damaged package document does not leak a
    raw parser error out of intake.
    """
    try:
        metadata, manifest, spine = _parse_opf(archive, opf_path)
    except ET.ParseError:
        yield from _epub_legacy_documents(archive, source_path)
        return
    book_title = metadata.get("title") or _title_from_path(source_path)
    book_slug = slugify(book_title) or slugify(source_path.stem) or "ebook"
    for spine_index, entry in enumerate(spine):
        item = manifest.get(entry.item_id)
        if item is None or not item.href:
            continue
        try:
            raw = archive.read(item.href).decode("utf-8", errors="replace")
        except KeyError:
            # Manifest names a member that is not in the zip.
            continue
        role = _classify_section(item, entry, raw)
        if role != SECTION_ROLE_BODY and not include_non_body:
            continue
        suffix = Path(item.href).suffix.lower()
        if suffix in {".txt", ".md"}:
            title = _markdown_title(raw) or _title_from_path(Path(item.href))
            markdown_body = _ensure_h1(_normalize_newlines(raw).strip() + "\n", title)
        else:
            title = _html_title(raw) or _title_from_path(Path(item.href))
            text = _html_to_text(raw)
            # Drop a leading repeat of the title; it is re-added as the H1.
            if text.lower().startswith(title.lower()):
                text = text[len(title) :].strip()
            markdown_body = f"# {title}\n\n{text}\n"
        section_slug = (
            slugify(title)
            or slugify(Path(item.href).stem)
            or f"section-{spine_index + 1:03d}"
        )
        # The 1-based spine position keeps slugs distinct and ordered even
        # when several sections share the same title.
        base_slug = f"{book_slug}-{spine_index + 1:03d}-{section_slug}"
        yield _SourceDocument(
            title=title,
            markdown=markdown_body,
            source_type="epub",
            original_path=f"{source_path}!{item.href}",
            base_slug=base_slug,
            section_role=role,
            spine_index=spine_index,
            book_metadata=metadata,
        )
def _epub_legacy_documents(
    archive: zipfile.ZipFile, source_path: Path
) -> Iterable[_SourceDocument]:
    """Yield documents from an EPUB without using an OPF package document.

    Walks archive members in sorted-name order and keeps those ending in
    .html, .htm, .xhtml, .txt or .md (directory entries excluded). Text and
    markdown members become documents directly; markup members go through
    `_html_document`. `original_path` is "<epub path>!<member name>".
    """
    plain_suffixes = {".txt", ".md"}
    markup_suffixes = {".html", ".htm", ".xhtml"}
    for member in sorted(archive.namelist()):
        member_path = Path(member)
        ext = member_path.suffix.lower()
        if member.endswith("/") or ext not in (plain_suffixes | markup_suffixes):
            continue
        payload = archive.read(member).decode("utf-8", errors="replace")
        origin = f"{source_path}!{member}"
        if ext not in plain_suffixes:
            yield _html_document(
                member_path,
                source_type="epub",
                original_path=origin,
                text=payload,
            )
            continue
        heading = _markdown_title(payload) or _title_from_path(member_path)
        body = _ensure_h1(_normalize_newlines(payload).strip() + "\n", heading)
        yield _SourceDocument(
            title=heading,
            markdown=body,
            source_type="epub",
            original_path=origin,
            base_slug=slugify(heading) or slugify(member_path.stem) or "source",
        )
def _classify_section(
    item: _EpubManifestItem,
    spine_entry: _EpubSpineEntry,
    content: str,
) -> str:
    """Return the SECTION_ROLE_* value for one spine item.

    Checks run in a fixed order and the first match wins:
      1. manifest properties ("nav", "cover-image"),
      2. file name / document title hints for cover pages,
      3. file name hints for nav, toc, license and notes,
      4. Project Gutenberg start/end markers in the upper-cased content,
      5. Project Gutenberg header/footer file name hints,
      6. a non-linear spine entry -> auxiliary,
      7. otherwise body.

    Args:
        item: Manifest item for the section (href and properties are read).
        spine_entry: Spine entry for the section (only `linear` is read).
        content: Decoded text of the section.
    """
    name = Path(item.href).name.lower()
    if "nav" in item.properties:
        return SECTION_ROLE_NAV
    if "cover-image" in item.properties:
        return SECTION_ROLE_COVER
    if name.startswith("cover") or "titlepage" in name:
        return SECTION_ROLE_COVER
    # The `or ""` guards against `_html_title` yielding no title; its other
    # call site in this module also treats the result as possibly falsy.
    doc_title = re.sub(
        r"[^a-z0-9 ]+", "", (_html_title(content) or "").lower()
    ).strip()
    if doc_title in {"cover", "cover page", "title page", "titlepage"}:
        return SECTION_ROLE_COVER
    if name.startswith("nav"):
        return SECTION_ROLE_NAV
    if "toc" in name or "contents" in name:
        return SECTION_ROLE_TOC
    if "license" in name or "copyright" in name or "rights" in name:
        return SECTION_ROLE_LICENSE
    if "transcriber" in name or "notes" in name:
        return SECTION_ROLE_NOTES
    # Markers are upper-case constants, so compare against upper-cased text.
    upper = content.upper()
    if any(marker in upper for marker in PG_START_MARKERS):
        return SECTION_ROLE_HEADER
    if any(marker in upper for marker in PG_END_MARKERS):
        return SECTION_ROLE_FOOTER
    if "pgheader" in name or "pg-header" in name or "gutenberg-header" in name:
        return SECTION_ROLE_HEADER
    if "pgfooter" in name or "pg-footer" in name or "gutenberg-footer" in name:
        return SECTION_ROLE_FOOTER
    if not spine_entry.linear:
        return SECTION_ROLE_AUXILIARY
    return SECTION_ROLE_BODY
def _zip_dirname(zip_path: str) -> str:
normalized = zip_path.replace("\\", "/")
if "/" not in normalized:
return ""
return normalized.rsplit("/", 1)[0]
def _join_zip_path(base: str, href: str) -> str:
base = base.replace("\\", "/").strip("/")
href = href.replace("\\", "/").lstrip("/")
if not base or base == ".":
return href
return f"{base}/{href}"
def _chunk_markdown(markdown: str, *, max_words: int) -> list[str]:
text = markdown.strip()
if max_words <= 0:

173
tests/test_epub3_intake.py Normal file
View File

@@ -0,0 +1,173 @@
import zipfile
from pathlib import Path
from infospace_bench.source_intake import (
SECTION_ROLE_BODY,
SECTION_ROLE_COVER,
SECTION_ROLE_FOOTER,
SECTION_ROLE_HEADER,
SECTION_ROLE_LICENSE,
SECTION_ROLE_NAV,
SECTION_ROLE_NOTES,
normalize_source,
)
CONTAINER_XML = """<?xml version="1.0"?>
<container version="1.0" xmlns="urn:oasis:names:tc:opendocument:xmlns:container">
<rootfiles>
<rootfile full-path="OEBPS/content.opf" media-type="application/oebps-package+xml"/>
</rootfiles>
</container>
"""
PACKAGE_OPF = """<?xml version="1.0" encoding="utf-8"?>
<package xmlns="http://www.idpf.org/2007/opf" version="3.0" unique-identifier="bookid">
<metadata xmlns:dc="http://purl.org/dc/elements/1.1/">
<dc:identifier id="bookid">urn:gutenberg:60979</dc:identifier>
<dc:title>Reminiscences of a Stock Operator</dc:title>
<dc:creator>Edwin Lefevre</dc:creator>
<dc:language>en</dc:language>
<dc:rights>Public domain in the USA.</dc:rights>
<dc:subject>Speculation</dc:subject>
<dc:subject>New York Stock Exchange</dc:subject>
<dc:source>https://www.gutenberg.org/ebooks/60979</dc:source>
<meta property="dcterms:modified">2026-05-01T00:00:00Z</meta>
</metadata>
<manifest>
<item id="nav" href="nav.xhtml" media-type="application/xhtml+xml" properties="nav"/>
<item id="cover" href="cover.xhtml" media-type="application/xhtml+xml"/>
<item id="pgheader" href="pgheader.xhtml" media-type="application/xhtml+xml"/>
<item id="ch1" href="chapter1.xhtml" media-type="application/xhtml+xml"/>
<item id="ch2" href="chapter2.xhtml" media-type="application/xhtml+xml"/>
<item id="notes" href="transcriber-notes.xhtml" media-type="application/xhtml+xml"/>
<item id="license" href="license.xhtml" media-type="application/xhtml+xml"/>
<item id="pgfooter" href="pgfooter.xhtml" media-type="application/xhtml+xml"/>
</manifest>
<spine>
<itemref idref="cover"/>
<itemref idref="nav" linear="no"/>
<itemref idref="pgheader"/>
<itemref idref="ch1"/>
<itemref idref="ch2"/>
<itemref idref="notes"/>
<itemref idref="license"/>
<itemref idref="pgfooter"/>
</spine>
</package>
"""
def _write_lefevre_epub3_fixture(path: Path) -> None:
    """Write a small EPUB3 archive shaped like the Gutenberg Lefevre book.

    The archive holds the mimetype, container.xml, the OPF package and eight
    XHTML members: nav, cover, PG header, two chapters, transcriber notes,
    license and PG footer.
    """
    # Insertion order is the write order inside the archive.
    members = {
        "mimetype": "application/epub+zip",
        "META-INF/container.xml": CONTAINER_XML,
        "OEBPS/content.opf": PACKAGE_OPF,
        "OEBPS/nav.xhtml": (
            "<html><head><title>Contents</title></head>"
            "<body><nav><ol><li>Chapter I</li><li>Chapter II</li></ol></nav></body></html>"
        ),
        "OEBPS/cover.xhtml": (
            "<html><head><title>Cover</title></head><body><h1>Cover</h1></body></html>"
        ),
        "OEBPS/pgheader.xhtml": (
            "<html><head><title>Reminiscences of a Stock Operator</title></head>"
            "<body><p>*** START OF THE PROJECT GUTENBERG EBOOK REMINISCENCES ***</p>"
            "<p>Produced by transcribers.</p></body></html>"
        ),
        "OEBPS/chapter1.xhtml": (
            "<html><head><title>Chapter I</title></head>"
            "<body><h1>Chapter I</h1>"
            "<p>I went to work when I was just out of grammar school.</p></body></html>"
        ),
        "OEBPS/chapter2.xhtml": (
            "<html><head><title>Chapter II</title></head>"
            "<body><h1>Chapter II</h1>"
            "<p>I was only fifteen when I made my first thousand dollars.</p></body></html>"
        ),
        "OEBPS/transcriber-notes.xhtml": (
            "<html><head><title>Transcriber's Notes</title></head>"
            "<body><h1>Transcriber's Notes</h1><p>Spelling normalised.</p></body></html>"
        ),
        "OEBPS/license.xhtml": (
            "<html><head><title>License</title></head>"
            "<body><h1>License</h1><p>Project Gutenberg License terms.</p></body></html>"
        ),
        "OEBPS/pgfooter.xhtml": (
            "<html><head><title>End</title></head>"
            "<body><p>*** END OF THE PROJECT GUTENBERG EBOOK REMINISCENCES ***</p></body></html>"
        ),
    }
    with zipfile.ZipFile(path, "w") as archive:
        for member_name, payload in members.items():
            archive.writestr(member_name, payload)
def test_epub3_intake_follows_spine_order_and_drops_non_body_by_default(tmp_path: Path) -> None:
    """Default intake keeps only the two chapters, in spine order."""
    epub_path = tmp_path / "lefevre.epub"
    _write_lefevre_epub3_fixture(epub_path)

    produced = normalize_source(epub_path)

    titles = [piece.title for piece in produced]
    positions = [piece.spine_index for piece in produced]
    assert titles == ["Chapter I", "Chapter II"]
    assert positions == [3, 4]
    for piece in produced:
        assert piece.section_role == SECTION_ROLE_BODY
    for piece in produced:
        assert piece.source_type == "epub"

    # Chunk IDs must be stable, ordered, and not collapse to the book title.
    first_id, second_id = produced[0].chunk_id, produced[1].chunk_id
    assert first_id.startswith("reminiscences-of-a-stock-operator-004-")
    assert second_id.startswith("reminiscences-of-a-stock-operator-005-")
    assert first_id != second_id
def test_epub3_intake_captures_book_metadata_provenance(tmp_path: Path) -> None:
    """OPF metadata from the fixture is carried on the first chunk."""
    epub_path = tmp_path / "lefevre.epub"
    _write_lefevre_epub3_fixture(epub_path)

    first_chunk = normalize_source(epub_path)[0]
    book = first_chunk.book_metadata

    assert book["title"] == "Reminiscences of a Stock Operator"
    assert book["creator"] == "Edwin Lefevre"
    assert book["language"] == "en"
    for expected_subject in ("Speculation", "New York Stock Exchange"):
        assert expected_subject in book["subjects"]
    assert book["rights"].startswith("Public domain")
    assert book["identifier"] == "urn:gutenberg:60979"
    assert book["source_url"] == "https://www.gutenberg.org/ebooks/60979"
    assert book["modified"] == "2026-05-01T00:00:00Z"
def test_epub3_intake_tags_non_body_sections_when_included(tmp_path: Path) -> None:
    """With include_non_body=True every spine item appears with its role."""
    epub_path = tmp_path / "lefevre.epub"
    _write_lefevre_epub3_fixture(epub_path)

    produced = normalize_source(epub_path, include_non_body=True)

    expected_roles = [
        SECTION_ROLE_COVER,
        SECTION_ROLE_NAV,
        SECTION_ROLE_HEADER,
        SECTION_ROLE_BODY,
        SECTION_ROLE_BODY,
        SECTION_ROLE_NOTES,
        SECTION_ROLE_LICENSE,
        SECTION_ROLE_FOOTER,
    ]
    role_at = {piece.spine_index: piece.section_role for piece in produced}
    for position, expected in enumerate(expected_roles):
        assert role_at[position] == expected
    assert [piece.spine_index for piece in produced] == list(range(8))
def test_epub3_intake_falls_back_to_archive_order_for_malformed_epub(tmp_path: Path) -> None:
    """A zip with no container.xml/OPF is read in archive-name order."""
    epub_path = tmp_path / "legacy.epub"
    bare_members = {
        "OEBPS/chapter1.xhtml": "<h1>Chapter One</h1><p>Alpha beta.</p>",
        "OEBPS/chapter2.xhtml": "<h1>Chapter Two</h1><p>Gamma delta.</p>",
    }
    with zipfile.ZipFile(epub_path, "w") as archive:
        for member_name, payload in bare_members.items():
            archive.writestr(member_name, payload)

    produced = normalize_source(epub_path)

    assert [piece.title for piece in produced] == ["Chapter One", "Chapter Two"]
    for piece in produced:
        assert piece.section_role == SECTION_ROLE_BODY
    for piece in produced:
        assert piece.spine_index is None
    for piece in produced:
        assert piece.book_metadata == {}

View File

@@ -8,7 +8,7 @@ status: active
owner: markitect
topic_slug: markitect
created: "2026-05-14"
updated: "2026-05-14"
updated: "2026-05-17"
state_hub_workstream_slug: "ib-wp-0016-lefevre-ebook-infospace-readiness"
state_hub_workstream_id: "23be7d20-b01f-4b17-9851-4d540e4c0984"
depends_on_workplans:
@@ -81,7 +81,7 @@ run should wait:
```task
id: IB-WP-0016-T01
status: in_progress
status: done
priority: high
state_hub_task_id: "a672fcf9-1b80-4faf-b16d-84ca52601dc9"
```