epub3 inbound filter

This commit is contained in:
2026-05-14 22:46:51 +02:00
parent 8d62b2d241
commit 925b36521d
7 changed files with 971 additions and 2 deletions

View File

@@ -1,3 +1,22 @@
# repo-seed
# markitect-filter
A git repository template to bootstrap coulomb projects from.
`markitect-filter` provides concrete source-format adapters for converting
external document formats into canonical Markitect Markdown representations.
The first adapter is `source.epub3`, a read-only EPUB3 adapter that implements
the `markitect-tool` source adapter contract.
## Development
Run the tests from this checkout (replace `/home/worsch/markitect-tool` with the path to your local `markitect-tool` checkout):
```bash
PYTHONPATH=src:/home/worsch/markitect-tool/src python3 -m pytest
```
The EPUB3 adapter is registered through:
```toml
[project.entry-points."markitect_tool.source_adapters"]
epub3 = "markitect_filter.adapters:epub3_adapter_descriptor"
```

30
pyproject.toml Normal file
View File

@@ -0,0 +1,30 @@
[build-system]
requires = ["setuptools>=69"]
build-backend = "setuptools.build_meta"
[project]
name = "markitect-filter"
version = "0.1.0"
description = "Source-format adapters for Markitect normalized Markdown"
readme = "README.md"
requires-python = ">=3.12"
license = { text = "MIT" }
dependencies = [
"markitect-tool @ file:///home/worsch/markitect-tool",
]
[project.optional-dependencies]
dev = [
"pytest>=8",
]
epub3 = []
[project.entry-points."markitect_tool.source_adapters"]
epub3 = "markitect_filter.adapters:epub3_adapter_descriptor"
[tool.setuptools.packages.find]
where = ["src"]
[tool.pytest.ini_options]
testpaths = ["tests"]
pythonpath = ["src", "../markitect-tool/src"]

View File

@@ -0,0 +1,5 @@
"""Concrete source-format adapters for Markitect."""
# Re-export the EPUB3 descriptor factory so it is importable from the package root.
from markitect_filter.adapters import epub3_adapter_descriptor
# Public package API: only the descriptor factory is exported.
__all__ = ["epub3_adapter_descriptor"]

View File

@@ -0,0 +1,51 @@
"""Adapter descriptors exposed through Markitect entry points."""
from __future__ import annotations
from markitect_tool.source import SourceAdapterDescriptor
def epub3_adapter_descriptor() -> SourceAdapterDescriptor:
    """Build the descriptor that advertises the EPUB3 read adapter.

    The descriptor carries the adapter id/version, the formats it claims
    (``application/epub+zip`` / ``.epub``), the JSON-schema-style option
    description, and declarative safety/quality metadata. The adapter class
    itself is imported only when the descriptor's factory is called.
    """

    def _make_adapter():
        # Deferred import: building the descriptor must not import the
        # adapter implementation module.
        from markitect_filter.epub3 import Epub3ReadAdapter

        return Epub3ReadAdapter()

    skip_boilerplate_option = {
        "type": "boolean",
        "default": True,
        "description": "Skip cover, nav, toc, headers, footers, and similar boilerplate when detected.",
    }
    option_schema = {
        "type": "object",
        "properties": {"skip_boilerplate": skip_boilerplate_option},
        "additionalProperties": False,
    }
    safety = {
        "reads_files": True,
        "writes_files": False,
        "network": False,
        "external_process": False,
    }
    quality_profile = {
        "text_extraction": "stdlib-xhtml",
        "images": "metadata-only",
        "styles": "ignored",
    }
    metadata = {
        "format": "EPUB3",
        "dependency_profile": "stdlib",
    }

    return SourceAdapterDescriptor(
        id="source.epub3",
        version="1",
        name="EPUB3",
        summary="Read EPUB3 packages into canonical Markitect Markdown.",
        operations=["read"],
        media_types=["application/epub+zip"],
        extensions=[".epub"],
        factory=_make_adapter,
        option_schema=option_schema,
        safety=safety,
        quality_profile=quality_profile,
        metadata=metadata,
    )

View File

@@ -0,0 +1,602 @@
"""EPUB3 read adapter implementation."""
from __future__ import annotations
import posixpath
import re
import zipfile
from dataclasses import dataclass
from pathlib import Path
from typing import Any
from xml.etree import ElementTree as ET
from markitect_tool.diagnostics import Diagnostic, SourceLocation, has_error
from markitect_tool.source import (
NormalizationQuality,
NormalizedMarkdownDocument,
NormalizedMarkdownSegment,
SourceAdapterMatch,
SourceAdapterMatchRequest,
SourceAsset,
SourceInspectRequest,
SourceInspectResult,
SourceMetadata,
SourceProvenance,
SourceReadRequest,
SourceReadResult,
normalization_cache_key,
)
from markitect_filter.adapters import epub3_adapter_descriptor
# Manifest media types whose spine items are converted to Markdown; any other
# media type found in the spine is skipped with a warning.
XHTML_MEDIA_TYPES = {"application/xhtml+xml", "text/html"}
# Lower-case fragments that mark a manifest item as boilerplate when they
# occur in its id, href, or properties.
BOILERPLATE_HINTS = {
    # navigation and front matter
    "cover",
    "titlepage",
    "nav",
    "toc",
    "table-of-contents",
    # legal and editorial notes
    "copyright",
    "license",
    "transcriber",
    # repeated page furniture
    "header",
    "footer",
}
@dataclass(frozen=True)
class EpubPackage:
    """Immutable snapshot of what was parsed from an EPUB package document."""

    # Archive path of the OPF file named by META-INF/container.xml ("" if unknown).
    rootfile_path: str
    # Metadata collected from the package document.
    metadata: SourceMetadata
    # Manifest items keyed by item id; each value holds id/href/media_type/properties.
    manifest: dict[str, dict[str, str]]
    # Manifest item ids in spine (reading) order.
    spine: list[str]
    # Link labels from the navigation document, keyed by href and by package path.
    nav_labels: dict[str, str]
    # Diagnostics gathered while loading the package.
    diagnostics: list[Diagnostic]
@dataclass(frozen=True)
class ExtractedSegment:
    """A converted spine item paired with the diagnostics raised while converting it."""

    # The Markdown segment produced for one spine item (its markdown may be empty).
    segment: NormalizedMarkdownSegment
    # Diagnostics emitted while reading and converting that spine item.
    diagnostics: list[Diagnostic]
class Epub3ReadAdapter:
    """Source adapter that converts EPUB3 packages to normalized Markdown."""

    def __init__(self) -> None:
        # The descriptor supplies the adapter id and version used in match
        # results and in the normalization cache key.
        self.descriptor = epub3_adapter_descriptor()

    def can_read(self, request: SourceAdapterMatchRequest) -> SourceAdapterMatch:
        """Report whether the asset looks like an EPUB.

        A matching media type scores 100, a matching ``.epub`` extension
        scores 80, and anything else is reported as unsupported.
        """
        asset = request.asset
        if asset.media_type == "application/epub+zip":
            matched, confidence, reason = True, 100, "media_type"
        elif asset.extension == ".epub":
            matched, confidence, reason = True, 80, "extension"
        else:
            matched, confidence, reason = False, 0, "unsupported"
        return SourceAdapterMatch(
            adapter_id=self.descriptor.id,
            matched=matched,
            confidence=confidence,
            reason=reason,
        )

    def inspect(self, request: SourceInspectRequest) -> SourceInspectResult:
        """Load the package and report its metadata, quality, and diagnostics."""
        asset = request.asset
        package = _load_package(asset)
        if package is None:
            diagnostics = [_malformed(asset, "Unable to read EPUB package.")]
            metadata = SourceMetadata()
        else:
            diagnostics = package.diagnostics
            metadata = package.metadata

        failed = has_error(diagnostics)
        quality = NormalizationQuality(
            lossiness="unknown" if failed else "low",
            confidence=0.0 if failed else 0.9,
            warnings=_warning_count(diagnostics),
        )
        return SourceInspectResult(
            asset=asset,
            adapter=_adapter_info(request.options),
            metadata=metadata,
            capabilities=["read"],
            quality=quality,
            diagnostics=diagnostics,
            valid=not failed,
        )

    def read(self, request: SourceReadRequest) -> SourceReadResult:
        """Convert every readable spine item into one normalized Markdown document.

        Returns an invalid result (without a document) when the package cannot
        be loaded, the archive cannot be opened, or no spine item yields any
        Markdown. Spine items that are missing from the manifest, have a
        non-XHTML media type, or look like boilerplate are skipped with a
        warning diagnostic.
        """
        asset = request.asset
        package = _load_package(asset)
        if package is None:
            return SourceReadResult(
                diagnostics=[_malformed(asset, "Unable to read EPUB package.")],
                valid=False,
            )
        if has_error(package.diagnostics):
            return SourceReadResult(diagnostics=package.diagnostics, valid=False)

        # Work on a copy so the package's own diagnostics list is not mutated.
        diagnostics = list(package.diagnostics)
        extracted: list[ExtractedSegment] = []
        skip_boilerplate = bool(request.options.get("skip_boilerplate", True))

        def note(code: str, message: str, details: dict[str, Any]) -> None:
            diagnostics.append(_warning(asset, code, message, details=details))

        try:
            with zipfile.ZipFile(Path(asset.path or asset.uri)) as archive:
                for idref in package.spine:
                    entry = package.manifest.get(idref)
                    if entry is None:
                        note(
                            "source.epub3.missing_spine_item",
                            f"Spine item `{idref}` is missing from the manifest.",
                            {"idref": idref},
                        )
                        continue

                    media_type = entry.get("media_type", "")
                    href = entry.get("href", "")
                    package_path = _resolve_package_path(package.rootfile_path, href)
                    if media_type not in XHTML_MEDIA_TYPES:
                        note(
                            "source.epub3.unsupported_media",
                            f"Skipped unsupported spine media type `{media_type}`.",
                            {"href": href, "media_type": media_type},
                        )
                        continue
                    if skip_boilerplate and _is_boilerplate(entry):
                        note(
                            "source.epub3.skipped_boilerplate",
                            f"Skipped boilerplate spine item `{href}`.",
                            {"href": href},
                        )
                        continue

                    label = package.nav_labels.get(href) or package.nav_labels.get(package_path)
                    extracted.append(
                        _extract_segment(
                            archive,
                            asset,
                            package_path,
                            href,
                            # Order counts extracted items only, not skipped ones.
                            order=len(extracted),
                            nav_label=label,
                        )
                    )
        except (OSError, zipfile.BadZipFile) as exc:
            return SourceReadResult(
                diagnostics=[_malformed(asset, "EPUB is not a readable ZIP package.", {"error": str(exc)})],
                valid=False,
            )

        # Keep only segments with visible Markdown, but keep every diagnostic.
        segments = []
        for item in extracted:
            if item.segment.markdown.strip():
                segments.append(item.segment)
            diagnostics.extend(item.diagnostics)

        if not segments:
            diagnostics.append(_malformed(asset, "EPUB did not produce any Markdown segments."))
            return SourceReadResult(diagnostics=diagnostics, valid=False)

        warning_total = _warning_count(diagnostics)
        failed = has_error(diagnostics)
        skipped_total = sum(
            1 for diagnostic in diagnostics if diagnostic.code == "source.epub3.skipped_boilerplate"
        )
        quality = NormalizationQuality(
            lossiness="low" if warning_total else "none",
            confidence=0.0 if failed else 0.9,
            skipped_items=skipped_total,
            warnings=warning_total,
            metadata={"extraction": "epub3-stdlib-xhtml"},
        )
        source_record = SourceProvenance(
            source_uri=asset.uri,
            source_path=asset.path,
            digest=asset.digest,
            metadata={"rootfile": package.rootfile_path},
        )
        cache_key = normalization_cache_key(
            asset=asset,
            adapter_id=self.descriptor.id,
            adapter_version=self.descriptor.version,
            options=request.options,
        )
        document = NormalizedMarkdownDocument(
            document_id=_document_id(asset, package.metadata),
            asset=asset,
            metadata=package.metadata,
            markdown="\n\n".join(segment.markdown.strip() for segment in segments),
            segments=segments,
            quality=quality,
            diagnostics=diagnostics,
            provenance=[source_record],
            adapter=_adapter_info(request.options),
            cache_key=cache_key,
        )
        # NOTE(review): the result-level diagnostics list is empty on this
        # path; the collected diagnostics are attached to the document only.
        # Confirm this matches the SourceReadResult contract.
        return SourceReadResult(document=document, diagnostics=[], valid=not failed)
def _load_package(asset: SourceAsset) -> EpubPackage | None:
    """Open the EPUB archive and parse its container and package documents.

    The archive is opened from ``asset.path``, falling back to ``asset.uri``.
    Problems are reported through the returned package's ``diagnostics``; as
    written, every path returns an ``EpubPackage`` (possibly a mostly empty
    one) rather than ``None``.
    """
    collected: list[Diagnostic] = []

    def _stub(rootfile: str, found: list[Diagnostic]) -> EpubPackage:
        # Package with no content, used when loading stops early.
        return EpubPackage(rootfile, SourceMetadata(), {}, [], {}, found)

    try:
        with zipfile.ZipFile(Path(asset.path or asset.uri)) as archive:
            rootfile_path = _read_container(archive, asset, collected)
            if not rootfile_path:
                return _stub("", collected)

            package_xml = _read_xml(archive, rootfile_path, asset, collected)
            if package_xml is None:
                return _stub(rootfile_path, collected)

            metadata = _extract_metadata(package_xml)
            manifest = _extract_manifest(package_xml)
            spine = _extract_spine(package_xml)
            nav_labels = _extract_nav_labels(archive, rootfile_path, manifest, asset, collected)
            if not spine:
                collected.append(_malformed(asset, "EPUB package does not declare a spine."))
            return EpubPackage(rootfile_path, metadata, manifest, spine, nav_labels, collected)
    except (OSError, zipfile.BadZipFile) as exc:
        # Only the ZIP-level failure is reported on this path.
        unreadable = _malformed(asset, "EPUB is not a readable ZIP package.", {"error": str(exc)})
        return _stub("", [unreadable])
def _read_container(
    archive: zipfile.ZipFile,
    asset: SourceAsset,
    diagnostics: list[Diagnostic],
) -> str | None:
    """Return the first non-empty ``full-path`` declared by a ``rootfile`` element.

    Reads ``META-INF/container.xml`` from the archive. When the file cannot
    be read, or no rootfile carries a ``full-path``, an error diagnostic is
    appended to ``diagnostics`` and ``None`` is returned.
    """
    container = _read_xml(archive, "META-INF/container.xml", asset, diagnostics)
    if container is None:
        diagnostics.append(_malformed(asset, "EPUB is missing META-INF/container.xml."))
        return None

    declared_paths = (
        node.attrib.get("full-path")
        for node in container.iter()
        if _local_name(node.tag) == "rootfile"
    )
    rootfile = next((path for path in declared_paths if path), None)
    if rootfile is None:
        diagnostics.append(_malformed(asset, "EPUB container does not declare a rootfile."))
    return rootfile
def _read_xml(
archive: zipfile.ZipFile,
path: str,
asset: SourceAsset,
diagnostics: list[Diagnostic],
) -> ET.Element | None:
try:
return ET.fromstring(archive.read(path))
except KeyError:
diagnostics.append(
_malformed(asset, f"EPUB package entry `{path}` is missing.", {"path": path})
)
except ET.ParseError as exc:
diagnostics.append(
_malformed(asset, f"EPUB XML entry `{path}` is malformed.", {"path": path, "error": str(exc)})
)
return None
def _extract_metadata(package_xml: ET.Element) -> SourceMetadata:
    """Collect descriptive metadata from a parsed package document.

    Every element of the tree is visited and matched by local (namespace-
    stripped) name. The first ``title`` becomes the title, all ``creator``
    values are kept, and the first ``language``/``rights``/``publisher``/
    ``date`` values win. ``identifier`` values are keyed by their ``id``
    attribute, or by a generated ``identifier-N`` key. ``meta`` elements are
    stored in ``raw`` under their ``property`` or ``name`` attribute, using
    the element text or, when the element has no text, its ``content``
    attribute.
    """
    raw: dict[str, Any] = {}
    titles: list[str] = []
    creators: list[str] = []
    identifiers: dict[str, str] = {}
    language = None
    rights = None
    publisher = None
    publication_date = None

    for element in package_xml.iter():
        name = _local_name(element.tag)
        text = _clean_text("".join(element.itertext()))

        if name == "meta":
            # Handled before the empty-text guard below: attribute-style
            # entries such as <meta name="cover" content="..."/> carry no
            # text, so the `content` fallback was previously unreachable.
            key = element.attrib.get("property") or element.attrib.get("name")
            value = text or element.attrib.get("content")
            if key and value:
                raw[key] = value
            continue

        if not text:
            continue

        if name == "title":
            titles.append(text)
        elif name == "creator":
            creators.append(text)
        elif name == "language" and language is None:
            language = text
        elif name == "rights" and rights is None:
            rights = text
        elif name == "publisher" and publisher is None:
            publisher = text
        elif name == "date" and publication_date is None:
            publication_date = text
        elif name == "identifier":
            identifier_key = element.attrib.get("id") or f"identifier-{len(identifiers) + 1}"
            identifiers[identifier_key] = text

    return SourceMetadata(
        title=titles[0] if titles else None,
        creators=creators,
        language=language,
        rights=rights,
        publication_date=publication_date,
        publisher=publisher,
        identifiers=identifiers,
        raw=raw,
    )
def _extract_manifest(package_xml: ET.Element) -> dict[str, dict[str, str]]:
    """Map manifest item ids to their ``id``/``href``/``media_type``/``properties``.

    Items lacking an ``id`` or ``href`` attribute are ignored. When an id is
    repeated, the later item's values replace the earlier ones.
    """
    item_attributes = (
        node.attrib for node in package_xml.iter() if _local_name(node.tag) == "item"
    )
    return {
        attrs["id"]: {
            "id": attrs["id"],
            "href": attrs["href"],
            "media_type": attrs.get("media-type", ""),
            "properties": attrs.get("properties", ""),
        }
        for attrs in item_attributes
        if attrs.get("id") and attrs.get("href")
    }
def _extract_spine(package_xml: ET.Element) -> list[str]:
    """Return the ``idref`` values of the first ``spine`` element, in order.

    Only direct ``itemref`` children with a non-empty ``idref`` are included.
    An empty list is returned when the document has no ``spine`` element.
    """
    spine_element = next(
        (node for node in package_xml.iter() if _local_name(node.tag) == "spine"),
        None,
    )
    if spine_element is None:
        return []
    return [
        child.attrib["idref"]
        for child in spine_element
        if _local_name(child.tag) == "itemref" and child.attrib.get("idref")
    ]
def _extract_nav_labels(
    archive: zipfile.ZipFile,
    rootfile_path: str,
    manifest: dict[str, dict[str, str]],
    asset: SourceAsset,
    diagnostics: list[Diagnostic],
) -> dict[str, str]:
    """Collect link labels from the EPUB navigation document.

    The navigation document is the first manifest item whose ``properties``
    contain the ``nav`` token. Each ``<a href>`` with visible text is
    recorded twice: under its href as written (fragment removed) and under
    the archive path that href resolves to.

    Returns an empty mapping when no nav item is declared or the nav document
    cannot be parsed; parse problems are appended to ``diagnostics``.
    """
    nav_item = next(
        (
            item
            for item in manifest.values()
            if "nav" in item.get("properties", "").split()
        ),
        None,
    )
    if nav_item is None:
        return {}

    nav_path = _resolve_package_path(rootfile_path, nav_item["href"])
    nav_xml = _read_xml(archive, nav_path, asset, diagnostics)
    if nav_xml is None:
        return {}

    labels: dict[str, str] = {}
    for element in nav_xml.iter():
        if _local_name(element.tag) != "a":
            continue
        href = element.attrib.get("href")
        label = _clean_text("".join(element.itertext()))
        if not href or not label:
            continue
        target = href.split("#", 1)[0]
        if not target:
            # Fragment-only link into the nav document itself; it does not
            # name a content file.
            continue
        # First link wins: the chapter-level entry normally precedes links
        # to sub-sections or page-list entries that point at the same file.
        labels.setdefault(target, label)
        # Links inside the nav document are relative to the nav document,
        # which need not sit in the same directory as the package document.
        labels.setdefault(_resolve_package_path(nav_path, target), label)
    return labels
def _extract_segment(
    archive: zipfile.ZipFile,
    asset: SourceAsset,
    package_path: str,
    href: str,
    *,
    order: int,
    nav_label: str | None,
) -> ExtractedSegment:
    """Convert one content document from the archive into a Markdown segment.

    The document's ``body`` (or the whole document when there is no body)
    supplies the anchors, Markdown blocks, and first heading. ``nav_label``
    is used as the heading text only when the document has no heading. If
    the document cannot be parsed, an empty placeholder segment is returned
    together with the diagnostics explaining why.
    """
    found: list[Diagnostic] = []
    root = _read_xml(archive, package_path, asset, found)
    if root is None:
        placeholder = NormalizedMarkdownSegment(
            segment_id=f"seg-{order + 1:04d}",
            order=order,
            markdown="",
        )
        return ExtractedSegment(placeholder, found)

    body = _first_descendant(root, "body")
    content_root = root if body is None else body

    anchors = _anchors(content_root)
    non_empty_blocks = (block for block in _element_blocks(content_root) if block.strip())
    markdown = "\n\n".join(non_empty_blocks).strip()
    heading, heading_level = _first_heading(content_root)
    heading = heading or nav_label

    origin = SourceProvenance(
        source_uri=asset.uri,
        source_path=asset.path,
        source_href=href,
        package_path=package_path,
        anchor=anchors[0] if anchors else None,
        section=heading,
    )
    segment = NormalizedMarkdownSegment(
        segment_id=_segment_id(anchors, order),
        order=order,
        markdown=markdown,
        heading=heading,
        heading_level=heading_level,
        anchors=anchors,
        provenance=[origin],
        metadata={"package_path": package_path, "href": href},
    )
    return ExtractedSegment(segment, found)
# Elements regarded as inline content when deciding whether a container
# element is effectively a paragraph. Unknown elements are deliberately left
# out so that they keep being walked block by block.
_INLINE_TAGS = frozenset(
    {
        "a", "abbr", "b", "bdi", "bdo", "br", "cite", "code", "dfn", "em",
        "i", "img", "kbd", "mark", "q", "s", "samp", "small", "span",
        "strong", "sub", "sup", "time", "u", "var", "wbr",
    }
)


def _holds_only_inline_content(element: ET.Element) -> bool:
    """Return True when every descendant of ``element`` is a known inline element."""
    return all(
        _local_name(node.tag) in _INLINE_TAGS
        for node in element.iter()
        if node is not element
    )


def _element_blocks(element: ET.Element) -> list[str]:
    """Convert the children of ``element`` into a flat list of Markdown blocks.

    Headings become ``#``-prefixed lines, paragraphs become plain text, each
    list item becomes its own ``- `` block, and block quotes are prefixed
    with ``> ``. ``script``, ``style``, ``head``, ``br`` and ``hr`` produce
    nothing. Container elements (``section``, ``article``, ``div``, ``main``,
    ``body``) are walked recursively, except that a container holding only
    text and inline elements is emitted as a single paragraph. Any other
    element contributes the blocks of its children followed by its own
    direct text.
    """
    blocks: list[str] = []
    for child in element:
        name = _local_name(child.tag)
        if name in {"script", "style", "head"}:
            continue
        if name in {"h1", "h2", "h3", "h4", "h5", "h6"}:
            text = _inline_text(child)
            if text:
                blocks.append(f"{'#' * int(name[1])} {text}")
        elif name == "p":
            text = _inline_text(child)
            if text:
                blocks.append(text)
        elif name in {"ul", "ol"}:
            for item in child:
                if _local_name(item.tag) == "li":
                    text = _inline_text(item)
                    if text:
                        blocks.append(f"- {text}")
        elif name == "blockquote":
            text = _inline_text(child)
            if text:
                blocks.append("\n".join(f"> {line}" for line in text.splitlines()))
        elif name in {"section", "article", "div", "main", "body"}:
            if _holds_only_inline_content(child):
                # A container used as a paragraph (`<div>Text</div>`,
                # `<div>Hello <b>world</b></div>`): recursing would drop the
                # container's own text, so emit it as one paragraph instead.
                text = _inline_text(child)
                if text:
                    blocks.append(text)
            else:
                blocks.extend(_element_blocks(child))
        elif name in {"br", "hr"}:
            continue
        else:
            blocks.extend(_element_blocks(child))
            text = _direct_text(child)
            if text:
                blocks.append(text)
    return blocks
def _inline_text(element: ET.Element) -> str:
    """Return all text inside ``element``, space-joined and whitespace-normalized."""
    fragments = list(element.itertext())
    return _clean_text(" ".join(fragments))
def _direct_text(element: ET.Element) -> str:
    """Return the text owned by ``element`` itself, excluding text inside its children.

    That is the element's leading text plus the tail text following each
    direct child, space-joined and whitespace-normalized.
    """
    pieces = [element.text or ""]
    pieces.extend(child.tail or "" for child in element)
    return _clean_text(" ".join(pieces))
def _first_heading(element: ET.Element) -> tuple[str | None, int | None]:
    """Return the text and level of the first non-empty ``h1``-``h6`` in document order.

    Returns ``(None, None)`` when no heading with text is found.
    """
    heading_names = {"h1", "h2", "h3", "h4", "h5", "h6"}
    for node in element.iter():
        tag_name = _local_name(node.tag)
        if tag_name not in heading_names:
            continue
        title = _inline_text(node)
        if title:
            # The digit in the tag name is the heading level.
            return title, int(tag_name[1])
    return None, None
def _first_descendant(element: ET.Element, local_name: str) -> ET.Element | None:
    """Return the first element (``element`` included) whose local tag name matches, else ``None``."""
    matches = (node for node in element.iter() if _local_name(node.tag) == local_name)
    return next(matches, None)
def _anchors(element: ET.Element) -> list[str]:
anchors: list[str] = []
for descendant in element.iter():
value = descendant.attrib.get("id") or descendant.attrib.get("name")
if value and value not in anchors:
anchors.append(value)
return anchors
def _is_boilerplate(item: dict[str, str]) -> bool:
    """Return True when a manifest item's id, href, or properties contain a boilerplate hint.

    Matching is a case-insensitive substring test against each entry of
    ``BOILERPLATE_HINTS``.
    """
    # NOTE(review): substring matching also hits longer words, e.g. "cover"
    # inside "discover" or "toc" inside "stock".
    fields = (item.get("id", ""), item.get("href", ""), item.get("properties", ""))
    haystack = " ".join(fields).lower()
    for hint in BOILERPLATE_HINTS:
        if hint in haystack:
            return True
    return False
def _resolve_package_path(rootfile_path: str, href: str) -> str:
base = posixpath.dirname(rootfile_path)
return posixpath.normpath(posixpath.join(base, href))
def _segment_id(anchors: list[str], order: int) -> str:
if anchors:
slug = re.sub(r"[^a-z0-9._-]+", "-", anchors[0].lower()).strip("-")
if slug:
return slug
return f"seg-{order + 1:04d}"
def _document_id(asset: SourceAsset, metadata: SourceMetadata) -> str:
identifier = next(iter(metadata.identifiers.values()), None)
if identifier:
return f"source.epub3:{identifier}"
if asset.digest:
return f"source.epub3:{asset.digest.removeprefix('sha256:')}"
return f"source.epub3:{asset.name or asset.uri}"
def _adapter_info(options: dict[str, Any]) -> dict[str, Any]:
return {
"id": "source.epub3",
"version": "1",
"options": options,
}
def _warning(
    asset: SourceAsset,
    code: str,
    message: str,
    *,
    details: dict[str, Any] | None = None,
) -> Diagnostic:
    """Create a warning diagnostic with the given code and message.

    The diagnostic points at ``asset.path`` when the asset has one; missing
    ``details`` are replaced by an empty dict.
    """
    location = SourceLocation(path=asset.path) if asset.path else None
    return Diagnostic(
        severity="warning",
        code=code,
        message=message,
        source=location,
        details=details or {},
    )
def _malformed(
    asset: SourceAsset,
    message: str,
    details: dict[str, Any] | None = None,
) -> Diagnostic:
    """Create an error diagnostic with the fixed code ``source.malformed``.

    The diagnostic points at ``asset.path`` when the asset has one; missing
    ``details`` are replaced by an empty dict.
    """
    location = SourceLocation(path=asset.path) if asset.path else None
    return Diagnostic(
        severity="error",
        code="source.malformed",
        message=message,
        source=location,
        details=details or {},
    )
def _warning_count(diagnostics: list[Diagnostic]) -> int:
return sum(1 for diagnostic in diagnostics if diagnostic.severity == "warning")
def _local_name(tag: str) -> str:
return tag.rsplit("}", 1)[-1] if "}" in tag else tag
def _clean_text(text: str) -> str:
cleaned = re.sub(r"\s+", " ", text).strip()
return re.sub(r"\s+([.,;:!?])", r"\1", cleaned)

206
tests/test_epub3_adapter.py Normal file
View File

@@ -0,0 +1,206 @@
from pathlib import Path
import zipfile
from markitect_tool.source import (
SourceAdapterMatchRequest,
SourceAdapterRegistry,
SourceAsset,
SourceInspectRequest,
SourceReadRequest,
discover_source_adapters,
inspect_source,
normalize_source,
)
from markitect_filter.adapters import epub3_adapter_descriptor
class FakeEntryPoint:
    """Stand-in entry point whose ``load()`` yields the EPUB3 descriptor factory."""

    name = "epub3"

    def load(self):
        """Return the descriptor factory itself, not its result."""
        return epub3_adapter_descriptor
def test_epub3_descriptor_matches_contract():
    """The descriptor advertises the expected id, formats, safety flags, and option default."""
    described = epub3_adapter_descriptor()

    assert described.id == "source.epub3"
    assert described.operations == ["read"]
    assert described.media_types == ["application/epub+zip"]
    assert described.extensions == [".epub"]
    assert described.safety["network"] is False
    skip_option = described.option_schema["properties"]["skip_boilerplate"]
    assert skip_option["default"] is True
def test_epub3_adapter_matches_epub_assets(tmp_path: Path):
    """An asset with the EPUB media type is matched with full confidence."""
    book = _write_epub(tmp_path)
    book_asset = SourceAsset.from_path(book, media_type="application/epub+zip")
    reader = epub3_adapter_descriptor().instantiate()

    outcome = reader.can_read(SourceAdapterMatchRequest(asset=book_asset))

    assert outcome.matched
    assert outcome.confidence == 100
def test_epub3_adapter_inspects_metadata(tmp_path: Path):
    """Inspecting the fixture reports its package metadata and a low-lossiness quality."""
    book = _write_epub(tmp_path)
    book_asset = SourceAsset.from_path(book, media_type="application/epub+zip")
    reader = epub3_adapter_descriptor().instantiate()

    inspection = reader.inspect(SourceInspectRequest(asset=book_asset))

    assert inspection.is_valid
    found = inspection.metadata
    assert found.title == "Test Book"
    assert found.creators == ["Ada Lovelace"]
    assert found.language == "en"
    assert found.identifiers["bookid"] == "urn:test-book"
    assert inspection.quality.lossiness == "low"
def test_epub3_adapter_normalizes_spine_to_markdown(tmp_path: Path):
    """Reading the fixture yields both chapters as Markdown, in spine order."""
    book = _write_epub(tmp_path)
    book_asset = SourceAsset.from_path(book, media_type="application/epub+zip")
    reader = epub3_adapter_descriptor().instantiate()

    outcome = reader.read(SourceReadRequest(asset=book_asset, options={"skip_boilerplate": True}))

    assert outcome.is_valid
    assert outcome.document is not None
    document = outcome.document
    assert document.document_id == "source.epub3:urn:test-book"
    assert document.metadata.title == "Test Book"
    expected_markdown = (
        "# Opening\n\n"
        "First paragraph with emphasis.\n\n"
        "- First point\n\n"
        "- Second point\n\n"
        "## Continuation\n\n"
        "Second chapter text."
    )
    assert document.markdown == expected_markdown
    segment_ids = [segment.segment_id for segment in document.segments]
    assert segment_ids == [
        "opening",
        "continuation",
    ]
    assert document.segments[0].provenance[0].package_path == "EPUB/chapter1.xhtml"
    assert document.quality.lossiness == "none"
def test_markitect_api_can_use_epub3_registry(tmp_path: Path):
    """The top-level inspect/normalize helpers work with a registry holding the EPUB3 descriptor."""
    book = _write_epub(tmp_path)
    adapters = SourceAdapterRegistry([epub3_adapter_descriptor()])

    inspection = inspect_source(book, registry=adapters)
    conversion = normalize_source(book, registry=adapters)

    assert inspection.is_valid
    assert inspection.metadata.title == "Test Book"
    assert conversion.is_valid
    assert conversion.document is not None
    assert conversion.document.segments[1].heading == "Continuation"
def test_epub3_adapter_reports_malformed_missing_container(tmp_path: Path):
    """An archive without META-INF/container.xml is rejected with a malformed-source error."""
    broken = tmp_path / "broken.epub"
    with zipfile.ZipFile(broken, "w") as archive:
        # Only the mimetype entry is written; the container file is absent.
        archive.writestr("mimetype", "application/epub+zip")
    broken_asset = SourceAsset.from_path(broken, media_type="application/epub+zip")
    reader = epub3_adapter_descriptor().instantiate()

    outcome = reader.read(SourceReadRequest(asset=broken_asset))

    assert not outcome.is_valid
    first = outcome.diagnostics[0]
    assert first.code == "source.malformed"
    assert "container.xml" in first.message
def test_epub3_entry_point_discovery_shape():
    """Discovery over an entry-point-like object registers the EPUB3 adapter under its id."""
    discovered = discover_source_adapters([FakeEntryPoint()])
    registered = discovered.get("source.epub3")
    assert registered.name == "EPUB3"
def _write_epub(tmp_path: Path) -> Path:
epub_path = tmp_path / "test-book.epub"
with zipfile.ZipFile(epub_path, "w") as archive:
archive.writestr("mimetype", "application/epub+zip")
archive.writestr(
"META-INF/container.xml",
"""<?xml version="1.0" encoding="utf-8"?>
<container version="1.0" xmlns="urn:oasis:names:tc:opendocument:xmlns:container">
<rootfiles>
<rootfile full-path="EPUB/content.opf" media-type="application/oebps-package+xml"/>
</rootfiles>
</container>
""",
)
archive.writestr(
"EPUB/content.opf",
"""<?xml version="1.0" encoding="utf-8"?>
<package version="3.0" unique-identifier="bookid" xmlns="http://www.idpf.org/2007/opf">
<metadata xmlns:dc="http://purl.org/dc/elements/1.1/">
<dc:identifier id="bookid">urn:test-book</dc:identifier>
<dc:title>Test Book</dc:title>
<dc:creator>Ada Lovelace</dc:creator>
<dc:language>en</dc:language>
<dc:publisher>Markitect Fixtures</dc:publisher>
<dc:date>2026-05-14</dc:date>
</metadata>
<manifest>
<item id="nav" href="nav.xhtml" media-type="application/xhtml+xml" properties="nav"/>
<item id="chapter1" href="chapter1.xhtml" media-type="application/xhtml+xml"/>
<item id="chapter2" href="chapter2.xhtml" media-type="application/xhtml+xml"/>
</manifest>
<spine>
<itemref idref="chapter1"/>
<itemref idref="chapter2"/>
</spine>
</package>
""",
)
archive.writestr(
"EPUB/nav.xhtml",
"""<?xml version="1.0" encoding="utf-8"?>
<html xmlns="http://www.w3.org/1999/xhtml">
<body>
<nav epub:type="toc" xmlns:epub="http://www.idpf.org/2007/ops">
<ol>
<li><a href="chapter1.xhtml#opening">Opening</a></li>
<li><a href="chapter2.xhtml#continuation">Continuation</a></li>
</ol>
</nav>
</body>
</html>
""",
)
archive.writestr(
"EPUB/chapter1.xhtml",
"""<?xml version="1.0" encoding="utf-8"?>
<html xmlns="http://www.w3.org/1999/xhtml">
<body>
<section id="opening">
<h1>Opening</h1>
<p>First paragraph with <em>emphasis</em>.</p>
<ul>
<li>First point</li>
<li>Second point</li>
</ul>
</section>
</body>
</html>
""",
)
archive.writestr(
"EPUB/chapter2.xhtml",
"""<?xml version="1.0" encoding="utf-8"?>
<html xmlns="http://www.w3.org/1999/xhtml">
<body>
<section id="continuation">
<h2>Continuation</h2>
<p>Second chapter text.</p>
</section>
</body>
</html>
""",
)
return epub_path

View File

@@ -0,0 +1,56 @@
---
id: MKTF-WP-0001
type: workplan
title: "EPUB3 Read Adapter"
domain: markitect
status: done
owner: markitect-filter
topic_slug: markitect
planning_priority: complete
planning_order: 10
depends_on_workplans:
- MKTT-WP-0018
created: "2026-05-14"
updated: "2026-05-14"
---
# MKTF-WP-0001: EPUB3 Read Adapter
## Purpose
Implement the first concrete `markitect-filter` source adapter:
`source.epub3`, a read-only EPUB3 adapter that satisfies the
`markitect-tool` source adapter contract.
## Implemented Scope
- Python package scaffold with `pyproject.toml`.
- Entry point group registration:
`markitect_tool.source_adapters`.
- Lightweight `epub3_adapter_descriptor`.
- Stdlib-only EPUB3 package reading with `zipfile` and `ElementTree`.
- `META-INF/container.xml` rootfile discovery.
- OPF metadata, manifest, and spine extraction.
- EPUB nav label extraction.
- XHTML body extraction into ordered Markdown segments.
- Source provenance with package paths, hrefs, anchors, and section labels.
- Structured diagnostics for malformed EPUBs, skipped boilerplate, missing
spine items, unsupported media, and malformed XML.
- Tests for descriptor shape, matching, inspection, normalization, malformed
packages, Markitect API registry use, and entry point shape.
## Non-Goals
- PDF, DOCX, ODT, OCR, or browser extraction.
- Write/export adapters.
- Network fetching.
- Styling-preserving conversion.
- Image extraction beyond future metadata/attachment handling.
## Validation
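Run from the `markitect-filter` checkout (replace `/home/worsch/markitect-tool` with the path to your local `markitect-tool` checkout):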
```bash
PYTHONPATH=src:/home/worsch/markitect-tool/src python3 -m pytest
```