generated from coulomb/repo-seed
IB-WP-0016-T01: spine-aware EPUB3 intake
Parse META-INF/container.xml and the OPF package document, then iterate documents in spine reading order instead of archive-name sort. Classify each spine item (body, cover, nav, toc, header, footer, notes, license, auxiliary) and exclude non-body sections by default; include_non_body=True opts them back in for inspection. Capture OPF book metadata (title, creator, language, subjects, rights, identifier, source_url, modified) onto every chunk and propagate it through source artifact provenance. Preserve the legacy zip-without-OPF fallback for malformed EPUBs. Real Lefevre EPUB now yields 148 body chunks in spine order (was 155 mixed, archive-sorted) with cover=1, header=1, footer=4 detected and dropped. 78 tests pass. Co-Authored-By: Claude Opus 4.7 <noreply@anthropic.com>
This commit is contained in:
@@ -68,3 +68,24 @@ able to show:
|
|||||||
- selected chapter or chunk filters for smoke runs
|
- selected chapter or chunk filters for smoke runs
|
||||||
- deterministic fixture acceptance on a small Lefevre-like subset
|
- deterministic fixture acceptance on a small Lefevre-like subset
|
||||||
- optional live one-chapter smoke run with explicit provider/model/cost caps
|
- optional live one-chapter smoke run with explicit provider/model/cost caps
|
||||||
|
|
||||||
|
## T01 Result (2026-05-17)
|
||||||
|
|
||||||
|
Spine-aware EPUB3 intake landed. Re-running the local Lefevre EPUB through
|
||||||
|
`normalize_source(...)` now yields:
|
||||||
|
|
||||||
|
- 148 body chunks (default), down from the original 155 mixed chunks
|
||||||
|
- Spine reading order: indices 0..27 in declared order, not archive-name sort
|
||||||
|
- Full OPF metadata on every chunk's `book_metadata`:
|
||||||
|
title, creator, language, subjects, rights, identifier, source_url, modified
|
||||||
|
- Section roles classified across the 154 spine items:
|
||||||
|
`body=148`, `footer=4`, `cover=1`, `header=1`
|
||||||
|
- The four Gutenberg footer/license/notes sections and the `*** START OF…`
|
||||||
|
header section are now excluded from generation input by default and
|
||||||
|
available via `include_non_body=True` for inspection
|
||||||
|
- The legacy zip-without-OPF fallback path is preserved for malformed EPUBs
|
||||||
|
|
||||||
|
The remaining gap is title collapse: all body sections still share the
|
||||||
|
Project Gutenberg page title because chapter headings are not yet read from
|
||||||
|
in-document `<h1>` content. That collapse is T02's scope (chapter-aware
|
||||||
|
chunking and stable IDs from in-document headings).
|
||||||
|
|||||||
@@ -258,6 +258,9 @@ def _register_source_chunks(root: Path, chunks: list[SourceChunk]) -> None:
|
|||||||
"chunk_count": chunk.chunk_count,
|
"chunk_count": chunk.chunk_count,
|
||||||
"imported_at": chunk.imported_at,
|
"imported_at": chunk.imported_at,
|
||||||
"extractor_version": chunk.extractor_version,
|
"extractor_version": chunk.extractor_version,
|
||||||
|
"section_role": chunk.section_role,
|
||||||
|
"spine_index": chunk.spine_index,
|
||||||
|
"book_metadata": dict(chunk.book_metadata),
|
||||||
},
|
},
|
||||||
)
|
)
|
||||||
|
|
||||||
|
|||||||
@@ -4,21 +4,39 @@ import hashlib
|
|||||||
import html
|
import html
|
||||||
import re
|
import re
|
||||||
import zipfile
|
import zipfile
|
||||||
from dataclasses import asdict, dataclass
|
from dataclasses import asdict, dataclass, field
|
||||||
from datetime import datetime, timezone
|
from datetime import datetime, timezone
|
||||||
from pathlib import Path
|
from pathlib import Path
|
||||||
from typing import Iterable
|
from typing import Iterable
|
||||||
|
from xml.etree import ElementTree as ET
|
||||||
|
|
||||||
from .errors import InfospaceError
|
from .errors import InfospaceError
|
||||||
from .semantics import slugify
|
from .semantics import slugify
|
||||||
|
|
||||||
EXTRACTOR_VERSION = "generic-source-intake-v1"
|
EXTRACTOR_VERSION = "generic-source-intake-v2"
|
||||||
SUPPORTED_EXTENSIONS = {".md", ".markdown", ".txt", ".html", ".htm", ".epub"}
|
SUPPORTED_EXTENSIONS = {".md", ".markdown", ".txt", ".html", ".htm", ".epub"}
|
||||||
HTML_TITLE_RE = re.compile(r"<title[^>]*>(?P<title>.*?)</title>", re.I | re.S)
|
HTML_TITLE_RE = re.compile(r"<title[^>]*>(?P<title>.*?)</title>", re.I | re.S)
|
||||||
HTML_H1_RE = re.compile(r"<h1[^>]*>(?P<title>.*?)</h1>", re.I | re.S)
|
HTML_H1_RE = re.compile(r"<h1[^>]*>(?P<title>.*?)</h1>", re.I | re.S)
|
||||||
SCRIPT_STYLE_RE = re.compile(r"<(script|style)[^>]*>.*?</\1>", re.I | re.S)
|
SCRIPT_STYLE_RE = re.compile(r"<(script|style)[^>]*>.*?</\1>", re.I | re.S)
|
||||||
TAG_RE = re.compile(r"<[^>]+>")
|
TAG_RE = re.compile(r"<[^>]+>")
|
||||||
|
|
||||||
|
OPF_NS = "http://www.idpf.org/2007/opf"
|
||||||
|
DC_NS = "http://purl.org/dc/elements/1.1/"
|
||||||
|
CONTAINER_NS = "urn:oasis:names:tc:opendocument:xmlns:container"
|
||||||
|
|
||||||
|
SECTION_ROLE_BODY = "body"
|
||||||
|
SECTION_ROLE_COVER = "cover"
|
||||||
|
SECTION_ROLE_NAV = "nav"
|
||||||
|
SECTION_ROLE_TOC = "toc"
|
||||||
|
SECTION_ROLE_HEADER = "header"
|
||||||
|
SECTION_ROLE_FOOTER = "footer"
|
||||||
|
SECTION_ROLE_NOTES = "notes"
|
||||||
|
SECTION_ROLE_LICENSE = "license"
|
||||||
|
SECTION_ROLE_AUXILIARY = "auxiliary"
|
||||||
|
|
||||||
|
PG_START_MARKERS = ("*** START OF THE PROJECT GUTENBERG EBOOK", "*** START OF THIS PROJECT GUTENBERG EBOOK")
|
||||||
|
PG_END_MARKERS = ("*** END OF THE PROJECT GUTENBERG EBOOK", "*** END OF THIS PROJECT GUTENBERG EBOOK")
|
||||||
|
|
||||||
|
|
||||||
@dataclass(frozen=True)
|
@dataclass(frozen=True)
|
||||||
class SourceChunk:
|
class SourceChunk:
|
||||||
@@ -32,6 +50,9 @@ class SourceChunk:
|
|||||||
chunk_count: int
|
chunk_count: int
|
||||||
imported_at: str
|
imported_at: str
|
||||||
extractor_version: str = EXTRACTOR_VERSION
|
extractor_version: str = EXTRACTOR_VERSION
|
||||||
|
section_role: str = SECTION_ROLE_BODY
|
||||||
|
spine_index: int | None = None
|
||||||
|
book_metadata: dict = field(default_factory=dict)
|
||||||
|
|
||||||
def to_dict(self) -> dict:
|
def to_dict(self) -> dict:
|
||||||
return asdict(self)
|
return asdict(self)
|
||||||
@@ -44,6 +65,23 @@ class _SourceDocument:
|
|||||||
source_type: str
|
source_type: str
|
||||||
original_path: str
|
original_path: str
|
||||||
base_slug: str
|
base_slug: str
|
||||||
|
section_role: str = SECTION_ROLE_BODY
|
||||||
|
spine_index: int | None = None
|
||||||
|
book_metadata: dict = field(default_factory=dict)
|
||||||
|
|
||||||
|
|
||||||
|
@dataclass(frozen=True)
class _EpubManifestItem:
    # One <item> entry read from the OPF manifest.

    # Value of the item's "id" attribute; spine entries look items up by it.
    item_id: str
    # Archive path built by joining the OPF directory with the item's "href".
    href: str
    # Value of the "media-type" attribute, or "" when the attribute is absent.
    media_type: str
    # Whitespace-separated tokens of the "properties" attribute (may be empty).
    properties: frozenset
|
||||||
|
|
||||||
|
|
||||||
|
@dataclass(frozen=True)
class _EpubSpineEntry:
    # One <itemref> entry read from the OPF spine, kept in declaration order.

    # Value of the "idref" attribute; used as the key into the parsed manifest.
    item_id: str
    # False only when the itemref carries linear="no"; True otherwise.
    linear: bool
|
||||||
|
|
||||||
|
|
||||||
def normalize_source(
|
def normalize_source(
|
||||||
@@ -51,6 +89,7 @@ def normalize_source(
|
|||||||
*,
|
*,
|
||||||
max_words: int = 800,
|
max_words: int = 800,
|
||||||
max_chunks: int | None = None,
|
max_chunks: int | None = None,
|
||||||
|
include_non_body: bool = False,
|
||||||
) -> list[SourceChunk]:
|
) -> list[SourceChunk]:
|
||||||
source_path = Path(source)
|
source_path = Path(source)
|
||||||
if not source_path.exists():
|
if not source_path.exists():
|
||||||
@@ -59,7 +98,7 @@ def normalize_source(
|
|||||||
f"Source path does not exist: {source_path}",
|
f"Source path does not exist: {source_path}",
|
||||||
{"source": str(source_path)},
|
{"source": str(source_path)},
|
||||||
)
|
)
|
||||||
documents = list(_iter_documents(source_path))
|
documents = list(_iter_documents(source_path, include_non_body=include_non_body))
|
||||||
if not documents:
|
if not documents:
|
||||||
raise InfospaceError(
|
raise InfospaceError(
|
||||||
"unsupported_source",
|
"unsupported_source",
|
||||||
@@ -91,6 +130,9 @@ def normalize_source(
|
|||||||
chunk_index=index,
|
chunk_index=index,
|
||||||
chunk_count=len(pieces),
|
chunk_count=len(pieces),
|
||||||
imported_at=imported_at,
|
imported_at=imported_at,
|
||||||
|
section_role=document.section_role,
|
||||||
|
spine_index=document.spine_index,
|
||||||
|
book_metadata=dict(document.book_metadata),
|
||||||
)
|
)
|
||||||
)
|
)
|
||||||
if max_chunks is not None and max_chunks > 0 and len(chunks) >= max_chunks:
|
if max_chunks is not None and max_chunks > 0 and len(chunks) >= max_chunks:
|
||||||
@@ -98,11 +140,13 @@ def normalize_source(
|
|||||||
return chunks
|
return chunks
|
||||||
|
|
||||||
|
|
||||||
def _iter_documents(source_path: Path) -> Iterable[_SourceDocument]:
|
def _iter_documents(
|
||||||
|
source_path: Path, *, include_non_body: bool
|
||||||
|
) -> Iterable[_SourceDocument]:
|
||||||
if source_path.is_dir():
|
if source_path.is_dir():
|
||||||
for path in sorted(source_path.rglob("*")):
|
for path in sorted(source_path.rglob("*")):
|
||||||
if path.is_file() and path.suffix.lower() in SUPPORTED_EXTENSIONS:
|
if path.is_file() and path.suffix.lower() in SUPPORTED_EXTENSIONS:
|
||||||
yield from _iter_documents(path)
|
yield from _iter_documents(path, include_non_body=include_non_body)
|
||||||
return
|
return
|
||||||
|
|
||||||
suffix = source_path.suffix.lower()
|
suffix = source_path.suffix.lower()
|
||||||
@@ -113,7 +157,7 @@ def _iter_documents(source_path: Path) -> Iterable[_SourceDocument]:
|
|||||||
elif suffix in (".html", ".htm"):
|
elif suffix in (".html", ".htm"):
|
||||||
yield _html_document(source_path, source_type="html")
|
yield _html_document(source_path, source_type="html")
|
||||||
elif suffix == ".epub":
|
elif suffix == ".epub":
|
||||||
yield from _epub_documents(source_path)
|
yield from _epub_documents(source_path, include_non_body=include_non_body)
|
||||||
|
|
||||||
|
|
||||||
def _markdown_document(path: Path) -> _SourceDocument:
|
def _markdown_document(path: Path) -> _SourceDocument:
|
||||||
@@ -163,35 +207,18 @@ def _html_document(
|
|||||||
)
|
)
|
||||||
|
|
||||||
|
|
||||||
def _epub_documents(path: Path) -> Iterable[_SourceDocument]:
|
def _epub_documents(
|
||||||
|
path: Path, *, include_non_body: bool
|
||||||
|
) -> Iterable[_SourceDocument]:
|
||||||
try:
|
try:
|
||||||
with zipfile.ZipFile(path) as archive:
|
with zipfile.ZipFile(path) as archive:
|
||||||
names = [
|
opf_path = _resolve_opf_path(archive)
|
||||||
name
|
if opf_path is not None:
|
||||||
for name in sorted(archive.namelist())
|
yield from _epub3_spine_documents(
|
||||||
if Path(name).suffix.lower() in {".html", ".htm", ".xhtml", ".txt", ".md"}
|
archive, path, opf_path, include_non_body=include_non_body
|
||||||
and not name.endswith("/")
|
)
|
||||||
]
|
else:
|
||||||
for name in names:
|
yield from _epub_legacy_documents(archive, path)
|
||||||
raw = archive.read(name).decode("utf-8", errors="replace")
|
|
||||||
pseudo_path = Path(name)
|
|
||||||
if pseudo_path.suffix.lower() in {".txt", ".md"}:
|
|
||||||
title = _markdown_title(raw) or _title_from_path(pseudo_path)
|
|
||||||
markdown = _ensure_h1(_normalize_newlines(raw).strip() + "\n", title)
|
|
||||||
yield _SourceDocument(
|
|
||||||
title=title,
|
|
||||||
markdown=markdown,
|
|
||||||
source_type="epub",
|
|
||||||
original_path=f"{path}!{name}",
|
|
||||||
base_slug=slugify(title) or slugify(pseudo_path.stem) or "source",
|
|
||||||
)
|
|
||||||
else:
|
|
||||||
yield _html_document(
|
|
||||||
pseudo_path,
|
|
||||||
source_type="epub",
|
|
||||||
original_path=f"{path}!{name}",
|
|
||||||
text=raw,
|
|
||||||
)
|
|
||||||
except zipfile.BadZipFile as exc:
|
except zipfile.BadZipFile as exc:
|
||||||
raise InfospaceError(
|
raise InfospaceError(
|
||||||
"invalid_epub_source",
|
"invalid_epub_source",
|
||||||
@@ -200,6 +227,243 @@ def _epub_documents(path: Path) -> Iterable[_SourceDocument]:
|
|||||||
) from exc
|
) from exc
|
||||||
|
|
||||||
|
|
||||||
|
def _resolve_opf_path(archive: zipfile.ZipFile) -> str | None:
    """Return the OPF package path declared in ``META-INF/container.xml``.

    ``None`` is returned when container.xml is missing from the archive, is
    not well-formed XML, declares no ``rootfile``, declares an empty
    ``full-path``, or points at a name that is not a member of the archive.
    """
    try:
        container = ET.fromstring(archive.read("META-INF/container.xml"))
    except (KeyError, ET.ParseError):
        # KeyError: no container.xml member; ParseError: unreadable XML.
        return None

    declared = container.find(
        f"{{{CONTAINER_NS}}}rootfiles/{{{CONTAINER_NS}}}rootfile"
    )
    opf_path = None if declared is None else declared.attrib.get("full-path")
    if opf_path and opf_path in archive.namelist():
        return opf_path
    return None
|
||||||
|
|
||||||
|
|
||||||
|
def _parse_opf(
    archive: zipfile.ZipFile, opf_path: str
) -> tuple[dict, dict[str, _EpubManifestItem], list[_EpubSpineEntry]]:
    """Parse the OPF package document into metadata, manifest and spine.

    Args:
        archive: Open EPUB archive.
        opf_path: Archive member name of the OPF package document.

    Returns:
        A ``(metadata, manifest, spine)`` tuple: the metadata dict produced by
        ``_parse_opf_metadata``, manifest items keyed by their ``id``, and the
        spine entries in declaration order.

    Raises:
        KeyError: ``opf_path`` is not a member of the archive.
        xml.etree.ElementTree.ParseError: the document is not well-formed XML.
    """
    payload = archive.read(opf_path)
    try:
        # Hand the parser raw bytes so it honours the XML declaration / BOM;
        # package documents may legally be UTF-16, which a forced UTF-8 decode
        # would turn into unparseable text.
        root = ET.fromstring(payload)
    except ET.ParseError:
        # Keep the previous lenient behaviour for files with stray invalid
        # bytes: substitute them and try once more.
        root = ET.fromstring(payload.decode("utf-8", errors="replace"))

    metadata = _parse_opf_metadata(root)
    base = _zip_dirname(opf_path)

    manifest: dict[str, _EpubManifestItem] = {}
    for item in root.findall(f"{{{OPF_NS}}}manifest/{{{OPF_NS}}}item"):
        href = item.attrib.get("href", "")
        item_id = item.attrib.get("id", "")
        if not href or not item_id:
            # An item without both attributes cannot be referenced or read.
            continue
        manifest[item_id] = _EpubManifestItem(
            item_id=item_id,
            # Manifest hrefs are relative to the OPF file's own directory.
            href=_join_zip_path(base, href),
            media_type=item.attrib.get("media-type", ""),
            properties=frozenset((item.attrib.get("properties") or "").split()),
        )

    spine: list[_EpubSpineEntry] = []
    for entry in root.findall(f"{{{OPF_NS}}}spine/{{{OPF_NS}}}itemref"):
        idref = entry.attrib.get("idref")
        if not idref:
            continue
        spine.append(
            _EpubSpineEntry(
                item_id=idref,
                # Anything other than an explicit "no" counts as linear.
                linear=entry.attrib.get("linear", "yes") != "no",
            )
        )
    return metadata, manifest, spine
|
||||||
|
|
||||||
|
|
||||||
|
def _parse_opf_metadata(opf_root: ET.Element) -> dict:
    """Collect book-level metadata from the OPF ``<metadata>`` element.

    Args:
        opf_root: Root element of the parsed OPF package document.

    Returns:
        A dict containing only the keys that were found with non-empty text:
        ``title``, ``creator`` (first creator), ``creators`` (only when more
        than one), ``language``, ``rights``, ``subjects``, ``identifier``,
        ``source_url`` and ``modified``. Empty when there is no ``<metadata>``.
    """
    md = opf_root.find(f"{{{OPF_NS}}}metadata")
    if md is None:
        return {}

    def _all_text(tag: str) -> list[str]:
        # Normalise each value first, then drop the ones that end up empty so
        # whitespace-only elements never produce "" entries.
        cleaned = (
            _collapse_ws(el.text)
            for el in md.findall(f"{{{DC_NS}}}{tag}")
            if el.text
        )
        return [value for value in cleaned if value]

    def _first_text(tag: str) -> str:
        # First element that actually carries text; an empty leading element
        # must not hide a populated one that follows it.
        return next(iter(_all_text(tag)), "")

    out: dict = {}
    title = _first_text("title")
    if title:
        out["title"] = title
    creators = _all_text("creator")
    if creators:
        out["creator"] = creators[0]
        if len(creators) > 1:
            out["creators"] = creators
    language = _first_text("language")
    if language:
        out["language"] = language
    rights = _first_text("rights")
    if rights:
        out["rights"] = rights
    subjects = _all_text("subject")
    if subjects:
        out["subjects"] = subjects
    identifier = _first_text("identifier")
    if identifier:
        out["identifier"] = identifier
    source_url = _first_text("source")
    if source_url:
        out["source_url"] = source_url

    for meta in md.findall(f"{{{OPF_NS}}}meta"):
        prop = meta.attrib.get("property", "")
        text = _collapse_ws(meta.text) if meta.text else ""
        if not text:
            continue
        if prop == "dcterms:modified":
            out["modified"] = text
        elif prop == "dcterms:source" and "source_url" not in out:
            # dc:source wins; the dcterms:source meta is only a fallback.
            out["source_url"] = text
    return out
|
||||||
|
|
||||||
|
|
||||||
|
def _epub3_spine_documents(
    archive: zipfile.ZipFile,
    source_path: Path,
    opf_path: str,
    *,
    include_non_body: bool,
) -> Iterable[_SourceDocument]:
    """Yield one document per readable spine item, in spine order.

    Args:
        archive: Open EPUB archive.
        source_path: Path of the EPUB file; used for titles, slugs and the
            ``original_path`` of each document.
        opf_path: Archive member name of the OPF package document.
        include_non_body: When false, items classified as anything other than
            body are skipped.

    Yields:
        ``_SourceDocument`` objects carrying the section role, the spine index
        and a copy of the book metadata. Spine entries whose manifest item is
        unknown or missing from the archive are skipped. If the OPF itself is
        not well-formed XML, the archive-order legacy documents are yielded
        instead.
    """
    try:
        metadata, manifest, spine = _parse_opf(archive, opf_path)
    except ET.ParseError:
        # A container.xml that points at an unparseable package is a malformed
        # EPUB; degrade to the legacy path rather than leaking a raw XML error
        # that the caller does not handle.
        yield from _epub_legacy_documents(archive, source_path)
        return

    book_title = metadata.get("title") or _title_from_path(source_path)
    book_slug = slugify(book_title) or slugify(source_path.stem) or "ebook"
    for spine_index, entry in enumerate(spine):
        item = manifest.get(entry.item_id)
        if item is None or not item.href:
            continue
        try:
            raw = archive.read(item.href).decode("utf-8", errors="replace")
        except KeyError:
            # Manifest references a file that is not in the archive.
            continue
        role = _classify_section(item, entry, raw)
        if role != SECTION_ROLE_BODY and not include_non_body:
            continue
        suffix = Path(item.href).suffix.lower()
        if suffix in {".txt", ".md"}:
            title = _markdown_title(raw) or _title_from_path(Path(item.href))
            markdown_body = _ensure_h1(_normalize_newlines(raw).strip() + "\n", title)
        else:
            title = _html_title(raw) or _title_from_path(Path(item.href))
            text = _html_to_text(raw)
            # Drop a leading repeat of the title so it is not duplicated
            # underneath the generated heading.
            if text.lower().startswith(title.lower()):
                text = text[len(title) :].strip()
            markdown_body = f"# {title}\n\n{text}\n"
        section_slug = (
            slugify(title)
            or slugify(Path(item.href).stem)
            or f"section-{spine_index + 1:03d}"
        )
        # The 1-based spine position keeps slugs distinct and ordered even
        # when several sections share the same title.
        base_slug = f"{book_slug}-{spine_index + 1:03d}-{section_slug}"
        yield _SourceDocument(
            title=title,
            markdown=markdown_body,
            source_type="epub",
            original_path=f"{source_path}!{item.href}",
            base_slug=base_slug,
            section_role=role,
            spine_index=spine_index,
            # Per-document copy so documents do not alias one mutable dict.
            book_metadata=dict(metadata),
        )
|
||||||
|
|
||||||
|
|
||||||
|
def _epub_legacy_documents(
    archive: zipfile.ZipFile, source_path: Path
) -> Iterable[_SourceDocument]:
    """Yield documents from an EPUB without a usable OPF, in sorted name order.

    Every archive member with an HTML-like or plain-text suffix becomes one
    document; ``.txt``/``.md`` members are handled as markdown, everything
    else is delegated to ``_html_document``.
    """
    plain_suffixes = {".txt", ".md"}
    accepted_suffixes = {".html", ".htm", ".xhtml"} | plain_suffixes

    for member in sorted(archive.namelist()):
        member_path = Path(member)
        member_suffix = member_path.suffix.lower()
        if member_suffix not in accepted_suffixes or member.endswith("/"):
            continue

        raw = archive.read(member).decode("utf-8", errors="replace")
        origin = f"{source_path}!{member}"

        if member_suffix not in plain_suffixes:
            yield _html_document(
                member_path,
                source_type="epub",
                original_path=origin,
                text=raw,
            )
            continue

        title = _markdown_title(raw) or _title_from_path(member_path)
        body = _ensure_h1(_normalize_newlines(raw).strip() + "\n", title)
        yield _SourceDocument(
            title=title,
            markdown=body,
            source_type="epub",
            original_path=origin,
            base_slug=slugify(title) or slugify(member_path.stem) or "source",
        )
|
||||||
|
|
||||||
|
|
||||||
|
def _classify_section(
    item: _EpubManifestItem,
    spine_entry: _EpubSpineEntry,
    content: str,
) -> str:
    """Assign a section role to a spine item.

    Checks run in priority order: manifest properties, cover-like file names
    and titles, nav/TOC/licence/notes file names, Project Gutenberg start/end
    markers in the content, Gutenberg header/footer file names, and finally
    the spine ``linear`` flag. Anything left over is body text.

    Args:
        item: Manifest item for the section (file name and properties).
        spine_entry: Spine entry for the section (``linear`` flag).
        content: Decoded text of the section.

    Returns:
        One of the ``SECTION_ROLE_*`` constants.
    """
    name = Path(item.href).name.lower()
    if "nav" in item.properties:
        return SECTION_ROLE_NAV
    if "cover-image" in item.properties:
        return SECTION_ROLE_COVER
    if name.startswith("cover") or "titlepage" in name:
        return SECTION_ROLE_COVER
    # Tolerate a title helper that yields None when no title is present.
    raw_title = _html_title(content) or ""
    doc_title = re.sub(r"[^a-z0-9 ]+", "", raw_title.lower()).strip()
    if doc_title in {"cover", "cover page", "title page", "titlepage"}:
        return SECTION_ROLE_COVER
    if name.startswith("nav"):
        return SECTION_ROLE_NAV
    # "toc" must not be embedded in a longer word: a bare substring test also
    # matches names such as "stock-operator.xhtml" and would drop real body
    # text. Digits and punctuation around it ("toc01", "book_toc") still match.
    if re.search(r"(?<![a-z])toc(?![a-z])", name) or "contents" in name:
        return SECTION_ROLE_TOC
    if "license" in name or "copyright" in name or "rights" in name:
        return SECTION_ROLE_LICENSE
    if "transcriber" in name or "notes" in name:
        return SECTION_ROLE_NOTES
    # The marker constants are upper-case, so compare against upper-cased text.
    upper = content.upper()
    if any(marker in upper for marker in PG_START_MARKERS):
        return SECTION_ROLE_HEADER
    if any(marker in upper for marker in PG_END_MARKERS):
        return SECTION_ROLE_FOOTER
    if "pgheader" in name or "pg-header" in name or "gutenberg-header" in name:
        return SECTION_ROLE_HEADER
    if "pgfooter" in name or "pg-footer" in name or "gutenberg-footer" in name:
        return SECTION_ROLE_FOOTER
    if not spine_entry.linear:
        return SECTION_ROLE_AUXILIARY
    return SECTION_ROLE_BODY
|
||||||
|
|
||||||
|
|
||||||
|
def _zip_dirname(zip_path: str) -> str:
|
||||||
|
normalized = zip_path.replace("\\", "/")
|
||||||
|
if "/" not in normalized:
|
||||||
|
return ""
|
||||||
|
return normalized.rsplit("/", 1)[0]
|
||||||
|
|
||||||
|
|
||||||
|
def _join_zip_path(base: str, href: str) -> str:
|
||||||
|
base = base.replace("\\", "/").strip("/")
|
||||||
|
href = href.replace("\\", "/").lstrip("/")
|
||||||
|
if not base or base == ".":
|
||||||
|
return href
|
||||||
|
return f"{base}/{href}"
|
||||||
|
|
||||||
|
|
||||||
def _chunk_markdown(markdown: str, *, max_words: int) -> list[str]:
|
def _chunk_markdown(markdown: str, *, max_words: int) -> list[str]:
|
||||||
text = markdown.strip()
|
text = markdown.strip()
|
||||||
if max_words <= 0:
|
if max_words <= 0:
|
||||||
|
|||||||
173
tests/test_epub3_intake.py
Normal file
173
tests/test_epub3_intake.py
Normal file
@@ -0,0 +1,173 @@
|
|||||||
|
import zipfile
|
||||||
|
from pathlib import Path
|
||||||
|
|
||||||
|
from infospace_bench.source_intake import (
|
||||||
|
SECTION_ROLE_BODY,
|
||||||
|
SECTION_ROLE_COVER,
|
||||||
|
SECTION_ROLE_FOOTER,
|
||||||
|
SECTION_ROLE_HEADER,
|
||||||
|
SECTION_ROLE_LICENSE,
|
||||||
|
SECTION_ROLE_NAV,
|
||||||
|
SECTION_ROLE_NOTES,
|
||||||
|
normalize_source,
|
||||||
|
)
|
||||||
|
|
||||||
|
|
||||||
|
CONTAINER_XML = """<?xml version="1.0"?>
|
||||||
|
<container version="1.0" xmlns="urn:oasis:names:tc:opendocument:xmlns:container">
|
||||||
|
<rootfiles>
|
||||||
|
<rootfile full-path="OEBPS/content.opf" media-type="application/oebps-package+xml"/>
|
||||||
|
</rootfiles>
|
||||||
|
</container>
|
||||||
|
"""
|
||||||
|
|
||||||
|
PACKAGE_OPF = """<?xml version="1.0" encoding="utf-8"?>
|
||||||
|
<package xmlns="http://www.idpf.org/2007/opf" version="3.0" unique-identifier="bookid">
|
||||||
|
<metadata xmlns:dc="http://purl.org/dc/elements/1.1/">
|
||||||
|
<dc:identifier id="bookid">urn:gutenberg:60979</dc:identifier>
|
||||||
|
<dc:title>Reminiscences of a Stock Operator</dc:title>
|
||||||
|
<dc:creator>Edwin Lefevre</dc:creator>
|
||||||
|
<dc:language>en</dc:language>
|
||||||
|
<dc:rights>Public domain in the USA.</dc:rights>
|
||||||
|
<dc:subject>Speculation</dc:subject>
|
||||||
|
<dc:subject>New York Stock Exchange</dc:subject>
|
||||||
|
<dc:source>https://www.gutenberg.org/ebooks/60979</dc:source>
|
||||||
|
<meta property="dcterms:modified">2026-05-01T00:00:00Z</meta>
|
||||||
|
</metadata>
|
||||||
|
<manifest>
|
||||||
|
<item id="nav" href="nav.xhtml" media-type="application/xhtml+xml" properties="nav"/>
|
||||||
|
<item id="cover" href="cover.xhtml" media-type="application/xhtml+xml"/>
|
||||||
|
<item id="pgheader" href="pgheader.xhtml" media-type="application/xhtml+xml"/>
|
||||||
|
<item id="ch1" href="chapter1.xhtml" media-type="application/xhtml+xml"/>
|
||||||
|
<item id="ch2" href="chapter2.xhtml" media-type="application/xhtml+xml"/>
|
||||||
|
<item id="notes" href="transcriber-notes.xhtml" media-type="application/xhtml+xml"/>
|
||||||
|
<item id="license" href="license.xhtml" media-type="application/xhtml+xml"/>
|
||||||
|
<item id="pgfooter" href="pgfooter.xhtml" media-type="application/xhtml+xml"/>
|
||||||
|
</manifest>
|
||||||
|
<spine>
|
||||||
|
<itemref idref="cover"/>
|
||||||
|
<itemref idref="nav" linear="no"/>
|
||||||
|
<itemref idref="pgheader"/>
|
||||||
|
<itemref idref="ch1"/>
|
||||||
|
<itemref idref="ch2"/>
|
||||||
|
<itemref idref="notes"/>
|
||||||
|
<itemref idref="license"/>
|
||||||
|
<itemref idref="pgfooter"/>
|
||||||
|
</spine>
|
||||||
|
</package>
|
||||||
|
"""
|
||||||
|
|
||||||
|
|
||||||
|
def _write_lefevre_epub3_fixture(path: Path) -> None:
    """Write a small EPUB3 archive modelled on the Lefevre book to ``path``.

    The archive holds the container file, the package document, and eight
    XHTML sections (nav, cover, Gutenberg header, two chapters, transcriber
    notes, licence, Gutenberg footer).
    """
    # (member name, content) pairs, written in exactly this order.
    members = (
        ("mimetype", "application/epub+zip"),
        ("META-INF/container.xml", CONTAINER_XML),
        ("OEBPS/content.opf", PACKAGE_OPF),
        (
            "OEBPS/nav.xhtml",
            "<html><head><title>Contents</title></head>"
            "<body><nav><ol><li>Chapter I</li><li>Chapter II</li></ol></nav></body></html>",
        ),
        (
            "OEBPS/cover.xhtml",
            "<html><head><title>Cover</title></head><body><h1>Cover</h1></body></html>",
        ),
        (
            "OEBPS/pgheader.xhtml",
            "<html><head><title>Reminiscences of a Stock Operator</title></head>"
            "<body><p>*** START OF THE PROJECT GUTENBERG EBOOK REMINISCENCES ***</p>"
            "<p>Produced by transcribers.</p></body></html>",
        ),
        (
            "OEBPS/chapter1.xhtml",
            "<html><head><title>Chapter I</title></head>"
            "<body><h1>Chapter I</h1>"
            "<p>I went to work when I was just out of grammar school.</p></body></html>",
        ),
        (
            "OEBPS/chapter2.xhtml",
            "<html><head><title>Chapter II</title></head>"
            "<body><h1>Chapter II</h1>"
            "<p>I was only fifteen when I made my first thousand dollars.</p></body></html>",
        ),
        (
            "OEBPS/transcriber-notes.xhtml",
            "<html><head><title>Transcriber's Notes</title></head>"
            "<body><h1>Transcriber's Notes</h1><p>Spelling normalised.</p></body></html>",
        ),
        (
            "OEBPS/license.xhtml",
            "<html><head><title>License</title></head>"
            "<body><h1>License</h1><p>Project Gutenberg License terms.</p></body></html>",
        ),
        (
            "OEBPS/pgfooter.xhtml",
            "<html><head><title>End</title></head>"
            "<body><p>*** END OF THE PROJECT GUTENBERG EBOOK REMINISCENCES ***</p></body></html>",
        ),
    )
    with zipfile.ZipFile(path, "w") as archive:
        for member_name, payload in members:
            archive.writestr(member_name, payload)
|
||||||
|
|
||||||
|
|
||||||
|
def test_epub3_intake_follows_spine_order_and_drops_non_body_by_default(tmp_path: Path) -> None:
    """Default intake keeps only the two chapters, in spine order."""
    epub_path = tmp_path / "lefevre.epub"
    _write_lefevre_epub3_fixture(epub_path)

    produced = normalize_source(epub_path)

    titles = [chunk.title for chunk in produced]
    spine_positions = [chunk.spine_index for chunk in produced]
    assert titles == ["Chapter I", "Chapter II"]
    assert spine_positions == [3, 4]
    assert {chunk.section_role for chunk in produced} <= {SECTION_ROLE_BODY}
    assert {chunk.source_type for chunk in produced} <= {"epub"}

    # Chunk IDs must be stable, ordered, and not collapse to the book title.
    first_id = produced[0].chunk_id
    second_id = produced[1].chunk_id
    assert first_id.startswith("reminiscences-of-a-stock-operator-004-")
    assert second_id.startswith("reminiscences-of-a-stock-operator-005-")
    assert first_id != second_id
|
||||||
|
|
||||||
|
|
||||||
|
def test_epub3_intake_captures_book_metadata_provenance(tmp_path: Path) -> None:
    """The first chunk carries the OPF metadata declared in the fixture."""
    epub_path = tmp_path / "lefevre.epub"
    _write_lefevre_epub3_fixture(epub_path)

    first_chunk = normalize_source(epub_path)[0]
    book = first_chunk.book_metadata

    assert book["title"] == "Reminiscences of a Stock Operator"
    assert book["creator"] == "Edwin Lefevre"
    assert book["language"] == "en"
    subjects = book["subjects"]
    assert "Speculation" in subjects
    assert "New York Stock Exchange" in subjects
    assert book["rights"].startswith("Public domain")
    assert book["identifier"] == "urn:gutenberg:60979"
    assert book["source_url"] == "https://www.gutenberg.org/ebooks/60979"
    assert book["modified"] == "2026-05-01T00:00:00Z"
|
||||||
|
|
||||||
|
|
||||||
|
def test_epub3_intake_tags_non_body_sections_when_included(tmp_path: Path) -> None:
    """With include_non_body=True every spine item is returned with its role."""
    epub_path = tmp_path / "lefevre.epub"
    _write_lefevre_epub3_fixture(epub_path)

    produced = normalize_source(epub_path, include_non_body=True)

    role_at = {chunk.spine_index: chunk.section_role for chunk in produced}
    # Expected role for each spine position, in spine order.
    expected_roles = (
        SECTION_ROLE_COVER,
        SECTION_ROLE_NAV,
        SECTION_ROLE_HEADER,
        SECTION_ROLE_BODY,
        SECTION_ROLE_BODY,
        SECTION_ROLE_NOTES,
        SECTION_ROLE_LICENSE,
        SECTION_ROLE_FOOTER,
    )
    for position, expected in enumerate(expected_roles):
        assert role_at[position] == expected
    assert [chunk.spine_index for chunk in produced] == list(range(8))
|
||||||
|
|
||||||
|
|
||||||
|
def test_epub3_intake_falls_back_to_archive_order_for_malformed_epub(tmp_path: Path) -> None:
    """A zip without container.xml/OPF is still read, in archive-name order."""
    legacy_path = tmp_path / "legacy.epub"
    sections = (
        ("OEBPS/chapter1.xhtml", "<h1>Chapter One</h1><p>Alpha beta.</p>"),
        ("OEBPS/chapter2.xhtml", "<h1>Chapter Two</h1><p>Gamma delta.</p>"),
    )
    with zipfile.ZipFile(legacy_path, "w") as archive:
        for member_name, markup in sections:
            archive.writestr(member_name, markup)

    produced = normalize_source(legacy_path)

    assert [chunk.title for chunk in produced] == ["Chapter One", "Chapter Two"]
    # The legacy path sets no role, spine position or book metadata, so the
    # dataclass defaults must show through.
    assert {chunk.section_role for chunk in produced} <= {SECTION_ROLE_BODY}
    assert not any(chunk.spine_index is not None for chunk in produced)
    assert not any(chunk.book_metadata != {} for chunk in produced)
|
||||||
@@ -8,7 +8,7 @@ status: active
|
|||||||
owner: markitect
|
owner: markitect
|
||||||
topic_slug: markitect
|
topic_slug: markitect
|
||||||
created: "2026-05-14"
|
created: "2026-05-14"
|
||||||
updated: "2026-05-14"
|
updated: "2026-05-17"
|
||||||
state_hub_workstream_slug: "ib-wp-0016-lefevre-ebook-infospace-readiness"
|
state_hub_workstream_slug: "ib-wp-0016-lefevre-ebook-infospace-readiness"
|
||||||
state_hub_workstream_id: "23be7d20-b01f-4b17-9851-4d540e4c0984"
|
state_hub_workstream_id: "23be7d20-b01f-4b17-9851-4d540e4c0984"
|
||||||
depends_on_workplans:
|
depends_on_workplans:
|
||||||
@@ -81,7 +81,7 @@ run should wait:
|
|||||||
|
|
||||||
```task
|
```task
|
||||||
id: IB-WP-0016-T01
|
id: IB-WP-0016-T01
|
||||||
status: in_progress
|
status: done
|
||||||
priority: high
|
priority: high
|
||||||
state_hub_task_id: "a672fcf9-1b80-4faf-b16d-84ca52601dc9"
|
state_hub_task_id: "a672fcf9-1b80-4faf-b16d-84ca52601dc9"
|
||||||
```
|
```
|
||||||
|
|||||||
Reference in New Issue
Block a user