Files
infospace-bench/src/infospace_bench/source_intake.py
tegwick b9173b6569 IB-WP-0016-T02: chapter-aware chunking and stable IDs
Resolve chapter labels from EPUB nav entries (when present) and from the
first in-document h1/h2/h3 heading, parse roman-numeral and "Chapter N"
labels into numeric chapter indices, and generate stable IDs of the form
chapter-NN with -part-NNN suffix when a chapter exceeds max_words. The
chunker now operates on cleaned body text, distributes id="Page_*" page
anchors per part via inline markers extracted before splitting, and
supports a configurable overlap_words evidence window between adjacent
parts of the same chapter. Reclassify body sections whose chapter label
matches contents/transcriber-notes/license/colophon tokens so they leave
the body stream by default. Strip <head>...</head> from HTML body
extraction to stop the <title> tag from duplicating heading text in the
chunk markdown.

Real Lefevre EPUB now detects all 24 roman-numeral chapters with stable
chapter-NN IDs, distributes Page_N anchors across multi-part chapters,
and reclassifies Contents and Transcriber's Notes out of body
(role histogram body=67, cover=1, header=1, toc=1, notes=1, footer=2).
82 tests pass.

Co-Authored-By: Claude Opus 4.7 <noreply@anthropic.com>
2026-05-17 15:52:47 +02:00

781 lines
26 KiB
Python

from __future__ import annotations
import hashlib
import html
import re
import zipfile
from dataclasses import asdict, dataclass, field
from datetime import datetime, timezone
from pathlib import Path
from typing import Iterable
from xml.etree import ElementTree as ET
from .errors import InfospaceError
from .semantics import slugify
# Version tag recorded on every SourceChunk (default of ``extractor_version``).
EXTRACTOR_VERSION = "generic-source-intake-v3"
# File suffixes picked up when scanning a directory; also reported in the
# "unsupported_source" error details.
SUPPORTED_EXTENSIONS = {".md", ".markdown", ".txt", ".html", ".htm", ".epub"}
# Title candidates for an HTML document: <title> first, then the first <h1>.
HTML_TITLE_RE = re.compile(r"<title[^>]*>(?P<title>.*?)</title>", re.I | re.S)
HTML_H1_RE = re.compile(r"<h1[^>]*>(?P<title>.*?)</h1>", re.I | re.S)
# First heading of any level (h1-h6); used as the in-document chapter label.
HTML_FIRST_HEADING_RE = re.compile(
r"<(?P<tag>h[1-6])[^>]*>(?P<title>.*?)</(?P=tag)>", re.I | re.S
)
# Whole <script>/<style> elements and any remaining tag, removed when
# converting HTML to text.
SCRIPT_STYLE_RE = re.compile(r"<(script|style)[^>]*>.*?</\1>", re.I | re.S)
TAG_RE = re.compile(r"<[^>]+>")
# Opening tag whose id attribute starts with Page_, page-, pg- or p-
# (case-insensitive); the id is captured as ``danchor`` (double-quoted) or
# ``sanchor`` (single-quoted).
ANCHOR_OPEN_TAG_RE = re.compile(
r"""<(?P<name>[a-zA-Z][a-zA-Z0-9:-]*)[^>]*\bid=(?:"(?P<danchor>(?:Page_|page-|pg-|p-)[^"]*)"|'(?P<sanchor>(?:Page_|page-|pg-|p-)[^']*)')[^>]*>""",
re.I,
)
# Inline marker form that carries a page anchor through text extraction.
ANCHOR_MARKER_RE = re.compile(r"⟦anchor:(?P<anchor>[^⟧]+)⟧")
# A label made only of roman-numeral letters, with an optional trailing period.
ROMAN_NUMERAL_RE = re.compile(r"^([MDCLXVI]+)\.?$", re.I)
# A label starting with "chapter" followed by a roman or arabic number.
CHAPTER_NUMBER_RE = re.compile(
r"^chapter\s+(?P<value>[ivxlcdm]+|\d+)\b",
re.I,
)
# XML namespaces used when reading the EPUB container, package and content.
OPF_NS = "http://www.idpf.org/2007/opf"
DC_NS = "http://purl.org/dc/elements/1.1/"
CONTAINER_NS = "urn:oasis:names:tc:opendocument:xmlns:container"
XHTML_NS = "http://www.w3.org/1999/xhtml"
# Section roles assigned to EPUB spine documents. Only "body" sections are
# emitted unless the caller passes include_non_body=True.
SECTION_ROLE_BODY = "body"
SECTION_ROLE_COVER = "cover"
SECTION_ROLE_NAV = "nav"
SECTION_ROLE_TOC = "toc"
SECTION_ROLE_HEADER = "header"
SECTION_ROLE_FOOTER = "footer"
SECTION_ROLE_NOTES = "notes"
SECTION_ROLE_LICENSE = "license"
SECTION_ROLE_AUXILIARY = "auxiliary"
# Chapter labels that reclassify a body section. Labels are compared after
# lower-casing and removing every character outside [a-z0-9 ], which is why
# the tokens below carry no apostrophes or other punctuation.
TOC_LABEL_TOKENS = {"contents", "table of contents", "toc"}
NOTES_LABEL_TOKENS = {
"transcribers notes",
"transcriber notes",
"transcribers note",
"transcribers comments",
"editors notes",
"editor notes",
}
LICENSE_LABEL_TOKENS = {
"license",
"project gutenberg license",
"the project gutenberg license",
"license terms",
"copyright",
"colophon",
}
# Project Gutenberg boilerplate delimiters; matched against upper-cased content.
PG_START_MARKERS = ("*** START OF THE PROJECT GUTENBERG EBOOK", "*** START OF THIS PROJECT GUTENBERG EBOOK")
PG_END_MARKERS = ("*** END OF THE PROJECT GUTENBERG EBOOK", "*** END OF THIS PROJECT GUTENBERG EBOOK")
@dataclass(frozen=True)
class SourceChunk:
    """One normalized, immutable piece of a source document.

    Instances are produced by ``normalize_source``; a document that fits in
    a single piece yields one chunk, a longer one yields several parts.
    """

    # Unique ID within one normalize_source() result.
    chunk_id: str
    title: str
    # Chunk text, starting with a "# <heading>" line.
    markdown: str
    source_type: str
    original_path: str
    # SHA-256 hex digest of ``markdown``.
    digest: str
    # Zero-based position among the pieces of the same document.
    chunk_index: int
    # Number of pieces the document was split into.
    chunk_count: int
    # ISO-8601 UTC timestamp shared by all chunks of one run.
    imported_at: str
    extractor_version: str = EXTRACTOR_VERSION
    section_role: str = SECTION_ROLE_BODY
    # Position in the EPUB spine; None for non-spine sources.
    spine_index: int | None = None
    book_metadata: dict = field(default_factory=dict)
    chapter_label: str | None = None
    chapter_number: int | None = None
    # Page-anchor ids that fall inside this chunk.
    page_anchors: tuple = ()

    def to_dict(self) -> dict:
        """Return the chunk as a plain dictionary (recursively converted)."""
        as_mapping = asdict(self)
        return as_mapping
@dataclass(frozen=True)
class _SourceDocument:
    """Intermediate, not-yet-chunked document produced by a format reader."""

    title: str
    # Full document text; readers make sure it carries an H1 heading.
    markdown: str
    source_type: str
    original_path: str
    # Slug used as the chunk-id stem before part suffixes and de-duplication.
    base_slug: str
    section_role: str = SECTION_ROLE_BODY
    spine_index: int | None = None
    book_metadata: dict = field(default_factory=dict)
    chapter_label: str | None = None
    chapter_number: int | None = None
@dataclass(frozen=True)
class _EpubManifestItem:
item_id: str
href: str
media_type: str
properties: frozenset
@dataclass(frozen=True)
class _EpubSpineEntry:
item_id: str
linear: bool
def normalize_source(
    source: str | Path,
    *,
    max_words: int = 800,
    max_chunks: int | None = None,
    include_non_body: bool = False,
    overlap_words: int = 0,
) -> list[SourceChunk]:
    """Read ``source`` (file or directory) and return its normalized chunks.

    Args:
        source: Path to a supported file or to a directory scanned recursively.
        max_words: Word budget per chunk; documents above it are split into
            parts. A value <= 0 disables splitting.
        max_chunks: Stop after this many chunks; ``None`` or a non-positive
            value means no limit.
        include_non_body: Also emit EPUB sections whose role is not "body".
        overlap_words: Words shared between adjacent parts of one document.

    Raises:
        InfospaceError: ``missing_source`` when the path does not exist,
            ``unsupported_source`` when no supported document is found.
    """
    location = Path(source)
    if not location.exists():
        raise InfospaceError(
            "missing_source",
            f"Source path does not exist: {location}",
            {"source": str(location)},
        )

    found = list(_iter_documents(location, include_non_body=include_non_body))
    if not found:
        raise InfospaceError(
            "unsupported_source",
            f"No supported source documents found: {location}",
            {
                "source": str(location),
                "supported_extensions": sorted(SUPPORTED_EXTENSIONS),
            },
        )

    # One timestamp for the whole run so all chunks agree.
    stamp = datetime.now(timezone.utc).isoformat()
    limit = max_chunks if max_chunks is not None and max_chunks > 0 else None
    taken_ids: set[str] = set()
    result: list[SourceChunk] = []

    for doc in found:
        parts = _split_document(doc, max_words=max_words, overlap_words=overlap_words)
        total = len(parts)
        for position, (title, body, anchors) in enumerate(parts):
            # Only multi-part documents get the -part-NNN suffix.
            if total == 1:
                wanted_id = doc.base_slug
            else:
                wanted_id = f"{doc.base_slug}-part-{position + 1:03d}"
            result.append(
                SourceChunk(
                    chunk_id=_dedupe_chunk_id(wanted_id, taken_ids),
                    title=title,
                    markdown=body,
                    source_type=doc.source_type,
                    original_path=doc.original_path,
                    digest=_digest_text(body),
                    chunk_index=position,
                    chunk_count=total,
                    imported_at=stamp,
                    section_role=doc.section_role,
                    spine_index=doc.spine_index,
                    book_metadata=dict(doc.book_metadata),
                    chapter_label=doc.chapter_label,
                    chapter_number=doc.chapter_number,
                    page_anchors=tuple(anchors),
                )
            )
            if limit is not None and len(result) >= limit:
                return result
    return result
def _iter_documents(
    source_path: Path, *, include_non_body: bool
) -> Iterable[_SourceDocument]:
    """Yield the documents found at ``source_path``.

    A directory is walked recursively in sorted order and every file with a
    supported suffix is read; a single file is dispatched on its suffix.
    Files with an unsupported suffix yield nothing.
    """
    if source_path.is_dir():
        for child in sorted(source_path.rglob("*")):
            if not child.is_file():
                continue
            if child.suffix.lower() not in SUPPORTED_EXTENSIONS:
                continue
            yield from _iter_documents(child, include_non_body=include_non_body)
        return

    extension = source_path.suffix.lower()
    if extension == ".epub":
        yield from _epub_documents(source_path, include_non_body=include_non_body)
    elif extension in {".html", ".htm"}:
        yield _html_document(source_path, source_type="html")
    elif extension == ".txt":
        yield _text_document(source_path)
    elif extension in {".md", ".markdown"}:
        yield _markdown_document(source_path)
def _markdown_document(path: Path) -> _SourceDocument:
    """Read a UTF-8 markdown file into a ``_SourceDocument``.

    The title is the first H1 of the file, or a title derived from the file
    name when there is none; an H1 is prepended when the file has no H1.
    """
    content = _normalize_newlines(path.read_text(encoding="utf-8")).strip() + "\n"
    heading = _markdown_title(content)
    if not heading:
        heading = _title_from_path(path)
    slug = slugify(heading) or slugify(path.stem) or "source"
    return _SourceDocument(
        title=heading,
        markdown=_ensure_h1(content, heading),
        source_type="markdown",
        original_path=str(path),
        base_slug=slug,
    )
def _text_document(path: Path) -> _SourceDocument:
    """Read a UTF-8 plain-text file, titling it after its file name."""
    heading = _title_from_path(path)
    content = _normalize_newlines(path.read_text(encoding="utf-8")).strip()
    slug = slugify(heading) or "source"
    return _SourceDocument(
        title=heading,
        markdown=f"# {heading}\n\n{content}\n",
        source_type="text",
        original_path=str(path),
        base_slug=slug,
    )
def _html_document(
    path: Path,
    *,
    source_type: str,
    original_path: str | None = None,
    text: str | None = None,
) -> _SourceDocument:
    """Build a ``_SourceDocument`` from HTML.

    Args:
        path: File to read, or a pseudo path used only for naming when
            ``text`` is supplied.
        source_type: Value stored in the document's ``source_type``.
        original_path: Override for the recorded origin; defaults to ``path``.
        text: Already-loaded HTML; when given, ``path`` is not read.
    """
    raw = text if text is not None else path.read_text(encoding="utf-8")
    title = _html_title(raw) or _title_from_path(path)
    body = _html_to_text(raw)
    # Drop a leading copy of the title so it is not repeated under the H1.
    # Only strip when the title ends at a word boundary; otherwise a title
    # such as "Art" would eat the start of a body beginning "Artists ...".
    title_len = len(title)
    if body.lower().startswith(title.lower()) and not body[
        title_len : title_len + 1
    ].isalnum():
        body = body[title_len:].strip()
    markdown = f"# {title}\n\n{body}\n"
    return _SourceDocument(
        title=title,
        markdown=markdown,
        source_type=source_type,
        original_path=original_path or str(path),
        base_slug=slugify(title) or slugify(path.stem) or "source",
    )
def _epub_documents(
    path: Path, *, include_non_body: bool
) -> Iterable[_SourceDocument]:
    """Yield the documents of an EPUB archive.

    When the container names a package (OPF) document, sections are read in
    spine order; otherwise every text-like archive member is read in name
    order.

    Raises:
        InfospaceError: ``invalid_epub_source`` when the file is not a
            readable zip archive or its package document is not well-formed
            XML.
    """
    try:
        with zipfile.ZipFile(path) as archive:
            opf_path = _resolve_opf_path(archive)
            if opf_path is None:
                yield from _epub_legacy_documents(archive, path)
                return
            try:
                yield from _epub3_spine_documents(
                    archive, path, opf_path, include_non_body=include_non_body
                )
            except ET.ParseError as exc:
                # The package document is parsed before any section is
                # yielded; report a malformed one as a domain error rather
                # than leaking the XML parser exception to callers.
                raise InfospaceError(
                    "invalid_epub_source",
                    f"EPUB package document is not well-formed XML: {path}",
                    {"source": str(path), "opf_path": opf_path},
                ) from exc
    except zipfile.BadZipFile as exc:
        raise InfospaceError(
            "invalid_epub_source",
            f"EPUB source is not a readable zip archive: {path}",
            {"source": str(path)},
        ) from exc
def _resolve_opf_path(archive: zipfile.ZipFile) -> str | None:
    """Return the package-document path declared in ``container.xml``.

    ``None`` is returned when the container file is missing or malformed,
    declares no rootfile path, or points at a member that is not in the
    archive.
    """
    try:
        container = ET.fromstring(archive.read("META-INF/container.xml"))
    except (KeyError, ET.ParseError):
        return None

    node = container.find(
        f"{{{CONTAINER_NS}}}rootfiles/{{{CONTAINER_NS}}}rootfile"
    )
    declared = node.attrib.get("full-path") if node is not None else None
    if declared and declared in archive.namelist():
        return declared
    return None
def _parse_opf(
    archive: zipfile.ZipFile, opf_path: str
) -> tuple[dict, dict[str, _EpubManifestItem], list[_EpubSpineEntry]]:
    """Parse the package document into ``(metadata, manifest, spine)``.

    Manifest hrefs are resolved against the directory of the OPF file.
    Manifest items without an id or href and spine entries without an idref
    are skipped.
    """
    package = ET.fromstring(archive.read(opf_path).decode("utf-8", errors="replace"))
    book_metadata = _parse_opf_metadata(package)
    opf_dir = _zip_dirname(opf_path)

    manifest: dict[str, _EpubManifestItem] = {}
    for node in package.findall(f"{{{OPF_NS}}}manifest/{{{OPF_NS}}}item"):
        attrs = node.attrib
        target = attrs.get("href", "")
        identifier = attrs.get("id", "")
        if identifier and target:
            manifest[identifier] = _EpubManifestItem(
                item_id=identifier,
                href=_join_zip_path(opf_dir, target),
                media_type=attrs.get("media-type", ""),
                properties=frozenset((attrs.get("properties") or "").split()),
            )

    spine: list[_EpubSpineEntry] = []
    for ref in package.findall(f"{{{OPF_NS}}}spine/{{{OPF_NS}}}itemref"):
        target_id = ref.attrib.get("idref")
        if target_id:
            # Entries are linear unless explicitly marked linear="no".
            is_linear = ref.attrib.get("linear", "yes") != "no"
            spine.append(_EpubSpineEntry(item_id=target_id, linear=is_linear))

    return book_metadata, manifest, spine
def _parse_opf_metadata(opf_root: ET.Element) -> dict:
    """Extract Dublin Core and ``<meta>`` values from the package metadata.

    Returns a dict holding only the keys that have a non-empty value:
    ``title``, ``creator`` (plus ``creators`` when there are several),
    ``language``, ``rights``, ``subjects``, ``identifier``, ``source_url``
    and ``modified``. An empty dict is returned when there is no
    ``<metadata>`` element.
    """
    md = opf_root.find(f"{{{OPF_NS}}}metadata")
    if md is None:
        return {}

    def _first_text(tag: str) -> str:
        el = md.find(f"{{{DC_NS}}}{tag}")
        return _collapse_ws(el.text) if el is not None and el.text else ""

    def _all_text(tag: str) -> list[str]:
        # Whitespace-only elements collapse to "" and are dropped so they
        # cannot become an empty creator or subject entry.
        collapsed = (
            _collapse_ws(el.text)
            for el in md.findall(f"{{{DC_NS}}}{tag}")
            if el.text
        )
        return [value for value in collapsed if value]

    out: dict = {}
    title = _first_text("title")
    if title:
        out["title"] = title
    creators = _all_text("creator")
    if creators:
        out["creator"] = creators[0]
        if len(creators) > 1:
            out["creators"] = creators
    language = _first_text("language")
    if language:
        out["language"] = language
    rights = _first_text("rights")
    if rights:
        out["rights"] = rights
    subjects = _all_text("subject")
    if subjects:
        out["subjects"] = subjects
    identifier = _first_text("identifier")
    if identifier:
        out["identifier"] = identifier
    source_url = _first_text("source")
    if source_url:
        out["source_url"] = source_url
    for meta in md.findall(f"{{{OPF_NS}}}meta"):
        prop = meta.attrib.get("property", "")
        text = _collapse_ws(meta.text) if meta.text else ""
        if not text:
            continue
        if prop == "dcterms:modified":
            out["modified"] = text
        elif prop == "dcterms:source" and "source_url" not in out:
            # dc:source wins; the meta property is only a fallback.
            out["source_url"] = text
    return out
def _epub3_spine_documents(
    archive: zipfile.ZipFile,
    source_path: Path,
    opf_path: str,
    *,
    include_non_body: bool,
) -> Iterable[_SourceDocument]:
    """Yield one document per readable spine entry, in spine order.

    Each section gets a role, a chapter label (nav label first, otherwise
    the first in-document heading) and, for body sections with a numeric
    label, a chapter number. Non-body sections are skipped unless
    ``include_non_body`` is true. Spine entries whose manifest item or
    archive member is missing are skipped silently.
    """
    metadata, manifest, spine = _parse_opf(archive, opf_path)
    book_title = metadata.get("title") or _title_from_path(source_path)
    book_slug = slugify(book_title) or slugify(source_path.stem) or "ebook"
    nav_labels = _load_nav_labels(archive, manifest)
    used_chapter_numbers: set[int] = set()
    for spine_index, entry in enumerate(spine):
        item = manifest.get(entry.item_id)
        if item is None or not item.href:
            continue
        try:
            raw = archive.read(item.href).decode("utf-8", errors="replace")
        except KeyError:
            continue
        is_plain_text = Path(item.href).suffix.lower() in {".txt", ".md"}
        role = _classify_section(item, entry, raw)
        nav_label = nav_labels.get(item.href, "")
        heading_label = "" if is_plain_text else _first_heading_text(raw)
        chapter_label = nav_label or heading_label or None
        # Reclassify body sections whose chapter label matches known noise tokens
        if role == SECTION_ROLE_BODY and chapter_label:
            normalized_label = re.sub(r"[^a-z0-9 ]+", "", chapter_label.lower()).strip()
            if normalized_label in TOC_LABEL_TOKENS:
                role = SECTION_ROLE_TOC
            elif normalized_label in NOTES_LABEL_TOKENS:
                role = SECTION_ROLE_NOTES
            elif normalized_label in LICENSE_LABEL_TOKENS:
                role = SECTION_ROLE_LICENSE
        if role != SECTION_ROLE_BODY and not include_non_body:
            continue
        # Non-numeric labels (e.g. "Preface") keep chapter_number=None so the
        # slug falls back to the label.
        chapter_number: int | None = None
        if role == SECTION_ROLE_BODY and chapter_label:
            chapter_number = _parse_chapter_number(chapter_label)
        if chapter_number is not None:
            if chapter_number in used_chapter_numbers:
                # Duplicate numeric label across the book — keep label, drop the
                # numeric slot so the slug falls back to label-based naming.
                chapter_number = None
            else:
                used_chapter_numbers.add(chapter_number)
        if is_plain_text:
            title = chapter_label or _markdown_title(raw) or _title_from_path(Path(item.href))
            markdown_body = _ensure_h1(_normalize_newlines(raw).strip() + "\n", title)
        else:
            title = chapter_label or _html_title(raw) or _title_from_path(Path(item.href))
            marked = _inject_anchor_markers(raw)
            text = _html_to_text(marked)
            # Strip a leading copy of the title, but only at a word boundary:
            # a label such as "I" must not eat the first letter of "It was".
            title_len = len(title)
            if text.lower().startswith(title.lower()) and not text[
                title_len : title_len + 1
            ].isalnum():
                text = text[title_len:].strip()
            markdown_body = f"# {title}\n\n{text}\n"
        base_slug = _chapter_base_slug(
            role=role,
            chapter_number=chapter_number,
            chapter_label=chapter_label,
            book_slug=book_slug,
            spine_index=spine_index,
            href=item.href,
            title=title,
        )
        yield _SourceDocument(
            title=title,
            markdown=markdown_body,
            source_type="epub",
            original_path=f"{source_path}!{item.href}",
            base_slug=base_slug,
            section_role=role,
            spine_index=spine_index,
            book_metadata=metadata,
            chapter_label=chapter_label,
            chapter_number=chapter_number,
        )
def _chapter_base_slug(
    *,
    role: str,
    chapter_number: int | None,
    chapter_label: str | None,
    book_slug: str,
    spine_index: int,
    href: str,
    title: str,
) -> str:
    """Return the chunk-id stem for one spine section.

    Body sections become ``chapter-NN`` when numbered, otherwise
    ``chapter-<label slug>``. Everything else (and unlabeled body sections)
    becomes ``<book>-<NNN>-<section slug>`` using the one-based spine
    position.
    """
    ordinal = f"{spine_index + 1:03d}"
    if role == SECTION_ROLE_BODY:
        if chapter_number is not None:
            return f"chapter-{chapter_number:02d}"
        if chapter_label:
            label_slug = slugify(chapter_label) or f"section-{ordinal}"
            return f"chapter-{label_slug}"
    tail = slugify(title) or slugify(Path(href).stem) or f"section-{ordinal}"
    return f"{book_slug}-{ordinal}-{tail}"
def _load_nav_labels(
    archive: zipfile.ZipFile, manifest: dict[str, _EpubManifestItem]
) -> dict[str, str]:
    """Map archive paths to the link text of the EPUB navigation document.

    The navigation document is the first manifest item carrying the ``nav``
    property. Fragments are ignored and the first label seen for a path
    wins. An empty dict is returned when there is no such item or it cannot
    be read from the archive.
    """
    nav_item = None
    for candidate in manifest.values():
        if "nav" in candidate.properties:
            nav_item = candidate
            break
    if nav_item is None:
        return {}

    try:
        markup = archive.read(nav_item.href).decode("utf-8", errors="replace")
    except KeyError:
        return {}

    link_re = re.compile(
        r"""<a[^>]*\bhref=(?:"(?P<dhref>[^"#]+)(?:#[^"]*)?"|'(?P<shref>[^'#]+)(?:#[^']*)?')[^>]*>(?P<label>.*?)</a>""",
        re.I | re.S,
    )
    nav_dir = _zip_dirname(nav_item.href)
    found: dict[str, str] = {}
    for link in link_re.finditer(markup):
        target = link.group("dhref") or link.group("shref") or ""
        if not target:
            continue
        caption = _collapse_ws(_html_to_text(link.group("label")))
        if not caption:
            continue
        archive_path = _join_zip_path(nav_dir, target)
        if archive_path not in found:
            found[archive_path] = caption
    return found
def _first_heading_text(html_raw: str) -> str:
    """Return the text of the first h1-h6 heading, or ``""`` if none exists."""
    heading = HTML_FIRST_HEADING_RE.search(html_raw)
    if heading is None:
        return ""
    inner_markup = heading.group("title")
    return _collapse_ws(_html_to_text(inner_markup))
def _parse_chapter_number(label: str) -> int | None:
    """Return the chapter index encoded in ``label``, or ``None``.

    Recognized forms are a bare decimal number ("12"), a bare roman numeral
    with an optional trailing period ("XIV."), and "Chapter N" where N is
    decimal or roman.
    """
    stripped = label.strip()
    if not stripped:
        return None
    # isdecimal(), not isdigit(): isdigit() also accepts characters such as
    # "²" that int() rejects with ValueError.
    if stripped.isdecimal():
        return int(stripped)
    roman_match = ROMAN_NUMERAL_RE.match(stripped)
    if roman_match:
        value = _roman_to_int(roman_match.group(1).upper())
        if value > 0:
            return value
    chapter_match = CHAPTER_NUMBER_RE.match(stripped)
    if chapter_match:
        token = chapter_match.group("value")
        if token.isdecimal():
            return int(token)
        value = _roman_to_int(token.upper())
        if value > 0:
            return value
    return None
def _roman_to_int(value: str) -> int:
table = {"I": 1, "V": 5, "X": 10, "L": 50, "C": 100, "D": 500, "M": 1000}
total = 0
prev = 0
for ch in reversed(value):
cur = table.get(ch, 0)
if cur == 0:
return 0
if cur < prev:
total -= cur
else:
total += cur
prev = cur
return total
def _inject_anchor_markers(raw: str) -> str:
    """Insert an inline ``⟦anchor:ID⟧`` marker after each page-anchor tag.

    The marker survives tag stripping, so the position of every page anchor
    can still be located in the extracted text.
    """

    def repl(match: re.Match) -> str:
        anchor = match.group("danchor") or match.group("sanchor") or ""
        if not anchor:
            return match.group(0)
        # The closing bracket is required: ANCHOR_MARKER_RE only recognizes
        # a marker terminated by "⟧". Surrounding spaces keep the marker
        # from fusing with adjacent words.
        return f"{match.group(0)} ⟦anchor:{anchor}⟧ "

    return ANCHOR_OPEN_TAG_RE.sub(repl, raw)
def _epub_legacy_documents(
    archive: zipfile.ZipFile, source_path: Path
) -> Iterable[_SourceDocument]:
    """Yield a document for every text-like archive member, in name order.

    Used when the archive has no usable package document. ``.txt``/``.md``
    members are treated as markdown, the HTML-family members as HTML.
    """
    plain_suffixes = {".txt", ".md"}
    readable_suffixes = {".html", ".htm", ".xhtml"} | plain_suffixes

    for member in sorted(archive.namelist()):
        member_path = Path(member)
        extension = member_path.suffix.lower()
        if extension not in readable_suffixes or member.endswith("/"):
            continue
        content = archive.read(member).decode("utf-8", errors="replace")
        origin = f"{source_path}!{member}"
        if extension not in plain_suffixes:
            yield _html_document(
                member_path,
                source_type="epub",
                original_path=origin,
                text=content,
            )
            continue
        heading = _markdown_title(content) or _title_from_path(member_path)
        yield _SourceDocument(
            title=heading,
            markdown=_ensure_h1(_normalize_newlines(content).strip() + "\n", heading),
            source_type="epub",
            original_path=origin,
            base_slug=slugify(heading) or slugify(member_path.stem) or "source",
        )
def _classify_section(
    item: _EpubManifestItem,
    spine_entry: _EpubSpineEntry,
    content: str,
) -> str:
    """Assign a section role to one spine document.

    Checks run in a fixed order and the first hit wins: manifest properties,
    cover hints (file name, then document title), file-name hints for
    nav/toc/license/notes, Project Gutenberg boilerplate markers in the
    content, Gutenberg header/footer file names, and finally the spine's
    ``linear`` flag. Anything left is a body section.
    """
    props = item.properties
    filename = Path(item.href).name.lower()

    def mentions(*needles: str) -> bool:
        return any(needle in filename for needle in needles)

    if "nav" in props:
        return SECTION_ROLE_NAV
    if "cover-image" in props or filename.startswith("cover") or mentions("titlepage"):
        return SECTION_ROLE_COVER

    normalized_title = re.sub(r"[^a-z0-9 ]+", "", _html_title(content).lower()).strip()
    if normalized_title in {"cover", "cover page", "title page", "titlepage"}:
        return SECTION_ROLE_COVER

    if filename.startswith("nav"):
        return SECTION_ROLE_NAV
    if mentions("toc", "contents"):
        return SECTION_ROLE_TOC
    if mentions("license", "copyright", "rights"):
        return SECTION_ROLE_LICENSE
    if mentions("transcriber", "notes"):
        return SECTION_ROLE_NOTES

    # The boilerplate markers are upper-case, so compare against an
    # upper-cased copy of the content.
    shouted = content.upper()
    for markers, marker_role in (
        (PG_START_MARKERS, SECTION_ROLE_HEADER),
        (PG_END_MARKERS, SECTION_ROLE_FOOTER),
    ):
        if any(marker in shouted for marker in markers):
            return marker_role

    if mentions("pgheader", "pg-header", "gutenberg-header"):
        return SECTION_ROLE_HEADER
    if mentions("pgfooter", "pg-footer", "gutenberg-footer"):
        return SECTION_ROLE_FOOTER

    return SECTION_ROLE_BODY if spine_entry.linear else SECTION_ROLE_AUXILIARY
def _zip_dirname(zip_path: str) -> str:
normalized = zip_path.replace("\\", "/")
if "/" not in normalized:
return ""
return normalized.rsplit("/", 1)[0]
def _join_zip_path(base: str, href: str) -> str:
base = base.replace("\\", "/").strip("/")
href = href.replace("\\", "/").lstrip("/")
if not base or base == ".":
return href
return f"{base}/{href}"
def _split_document(
    document: _SourceDocument,
    *,
    max_words: int,
    overlap_words: int,
) -> list[tuple[str, str, list[str]]]:
    """Split a document into ``(title, markdown, page_anchors)`` parts.

    The first H1 line is removed from the body and re-added as the heading
    of every part. A document within ``max_words`` (or any document when
    ``max_words <= 0``) is returned as a single part. Otherwise the body is
    cut into windows of ``max_words`` words, each window starting
    ``max_words - overlap`` words after the previous one.
    """
    text = document.markdown.strip()
    heading = _markdown_title(text) or document.title or "Source"
    # ``\Z`` covers a heading-only document: after strip() there is no
    # newline behind the heading, and without this alternative the heading
    # would stay in the body and be emitted twice.
    body_with_markers = re.sub(
        r"(?m)^# .+?(?:\n+|\Z)", "", text, count=1
    ).strip()
    clean_body, anchor_positions = _extract_anchor_positions(body_with_markers)
    words = clean_body.split()
    if max_words <= 0 or len(words) <= max_words:
        anchors = [name for name, _idx in anchor_positions]
        return [(document.title, _compose_chunk(heading, clean_body), anchors)]

    # Clamp the overlap so every window advances by at least one word.
    overlap = max(0, min(overlap_words, max_words - 1))
    step = max_words - overlap
    total = len(words)
    parts: list[tuple[str, str, list[str]]] = []
    start = 0
    while start < total:
        end = min(start + max_words, total)
        is_last = end >= total
        # An anchor placed after the final word has index == total; widen
        # the last window by one so it is not silently dropped.
        anchor_end = end + 1 if is_last else end
        part_text = " ".join(words[start:end]).strip()
        part_anchors = _anchors_in_range(anchor_positions, start, anchor_end)
        part_title = f"{document.title} Part {len(parts) + 1}"
        parts.append((part_title, _compose_chunk(heading, part_text), part_anchors))
        if is_last:
            break
        start += step
    return parts
def _compose_chunk(heading: str, body: str) -> str:
body = body.strip()
if not body:
return f"# {heading}\n"
return f"# {heading}\n\n{body}\n"
def _extract_anchor_positions(text: str) -> tuple[str, list[tuple[str, int]]]:
    """Remove anchor markers from ``text`` and record where they were.

    Returns ``(cleaned_text, anchors)`` where each anchor is a
    ``(name, word_index)`` pair and ``word_index`` is the number of
    whitespace-separated words that precede the marker.
    """
    kept: list[str] = []
    anchors: list[tuple[str, int]] = []
    words_seen = 0
    cursor = 0
    for match in ANCHOR_MARKER_RE.finditer(text):
        segment = text[cursor : match.start()]
        kept.append(segment)
        # Running total instead of re-splitting every earlier segment for
        # each marker; the resulting index is the same.
        words_seen += len(segment.split())
        anchors.append((match.group("anchor"), words_seen))
        cursor = match.end()
    kept.append(text[cursor:])
    # Removing a marker leaves its surrounding spaces behind; squeeze them.
    cleaned = re.sub(r"[ \t]{2,}", " ", "".join(kept)).strip()
    return cleaned, anchors
def _anchors_in_range(
anchor_positions: list[tuple[str, int]], start: int, end: int
) -> list[str]:
seen: set[str] = set()
found: list[str] = []
for name, idx in anchor_positions:
if start <= idx < end and name not in seen:
seen.add(name)
found.append(name)
return found
def _html_title(raw: str) -> str:
    """Return the text of ``<title>``, else of the first ``<h1>``, else ``""``."""
    found = HTML_TITLE_RE.search(raw)
    if not found:
        found = HTML_H1_RE.search(raw)
    if not found:
        return ""
    return _collapse_ws(_html_to_text(found.group("title")))
def _html_to_text(raw: str) -> str:
    """Reduce HTML to plain text with blank-line separated paragraphs.

    The ``<head>`` element and script/style elements are dropped, closing
    block-level tags become line breaks, remaining tags become spaces and
    character references are decoded. Empty lines are discarded.
    """
    without_head = re.sub(r"<head\b[^>]*>.*?</head>", " ", raw, flags=re.I | re.S)
    without_code = SCRIPT_STYLE_RE.sub(" ", without_head)
    with_breaks = re.sub(
        r"</(p|div|section|article|h[1-6]|li)>", "\n", without_code, flags=re.I
    )
    plain = html.unescape(TAG_RE.sub(" ", with_breaks))

    paragraphs = []
    for line in plain.splitlines():
        squeezed = _collapse_ws(line)
        if squeezed:
            paragraphs.append(squeezed)
    return "\n\n".join(paragraphs).strip()
def _ensure_h1(markdown: str, title: str) -> str:
if re.search(r"(?m)^#\s+\S", markdown):
return markdown
return f"# {title}\n\n{markdown.strip()}\n"
def _markdown_title(markdown: str) -> str:
match = re.search(r"(?m)^#\s+(?P<title>.+?)\s*$", markdown)
return match.group("title").strip() if match else ""
def _title_from_path(path: Path) -> str:
words = re.sub(r"[^A-Za-z0-9]+", " ", path.stem).strip()
return words.title() if words else "Source"
def _dedupe_chunk_id(base_id: str, used_ids: set[str]) -> str:
candidate = base_id or "source"
if candidate not in used_ids:
used_ids.add(candidate)
return candidate
index = 2
while f"{candidate}-{index}" in used_ids:
index += 1
deduped = f"{candidate}-{index}"
used_ids.add(deduped)
return deduped
def _digest_text(text: str) -> str:
return hashlib.sha256(text.encode("utf-8")).hexdigest()
def _collapse_ws(value: str) -> str:
return re.sub(r"\s+", " ", value).strip()
def _normalize_newlines(value: str) -> str:
return value.replace("\r\n", "\n").replace("\r", "\n")