# infospace-bench/src/infospace_bench/source_intake.py

from __future__ import annotations

import hashlib
import html
import re
import zipfile
from dataclasses import asdict, dataclass, field
from datetime import datetime, timezone
from pathlib import Path
from typing import Iterable
from xml.etree import ElementTree as ET

from .errors import InfospaceError
from .semantics import slugify

# Version tag used as the default ``extractor_version`` of every SourceChunk.
EXTRACTOR_VERSION = "generic-source-intake-v3"
# Lower-cased file suffixes accepted when scanning a source directory; also
# reported in the "unsupported_source" error details.
SUPPORTED_EXTENSIONS = {".md", ".markdown", ".txt", ".html", ".htm", ".epub"}
# Inner content of the first <title> element (case-insensitive, spans lines).
HTML_TITLE_RE = re.compile(r"<title[^>]*>(?P<title>.*?)</title>", re.I | re.S)
# Inner content of the first <h1> element.
HTML_H1_RE = re.compile(r"<h1[^>]*>(?P<title>.*?)</h1>", re.I | re.S)
# Inner content of the first heading of any level (h1-h6); the closing tag must
# match the opening one via the ``tag`` back-reference.
HTML_FIRST_HEADING_RE = re.compile(
    r"<(?P<tag>h[1-6])[^>]*>(?P<title>.*?)</(?P=tag)>", re.I | re.S
)
# Whole <script>...</script> and <style>...</style> elements, content included.
SCRIPT_STYLE_RE = re.compile(r"<(script|style)[^>]*>.*?</\1>", re.I | re.S)
# Any single markup tag.
TAG_RE = re.compile(r"<[^>]+>")
# Opening tag whose ``id`` attribute starts with one of the page-anchor prefixes
# (Page_, page-, pg-, p-); the id is captured as ``danchor`` (double-quoted) or
# ``sanchor`` (single-quoted).
ANCHOR_OPEN_TAG_RE = re.compile(
    r"""<(?P<name>[a-zA-Z][a-zA-Z0-9:-]*)[^>]*\bid=(?:"(?P<danchor>(?:Page_|page-|pg-|p-)[^"]*)"|'(?P<sanchor>(?:Page_|page-|pg-|p-)[^']*)')[^>]*>""",
    re.I,
)
# Inline marker written by _inject_anchor_markers and consumed by
# _extract_anchor_positions.
ANCHOR_MARKER_RE = re.compile(r"⟦anchor:(?P<anchor>[^⟧]+)⟧")
# A label consisting solely of roman-numeral letters, optionally followed by ".".
ROMAN_NUMERAL_RE = re.compile(r"^([MDCLXVI]+)\.?$", re.I)
# A label starting with "chapter" followed by a roman or arabic number.
CHAPTER_NUMBER_RE = re.compile(
    r"^chapter\s+(?P<value>[ivxlcdm]+|\d+)\b",
    re.I,
)

# XML namespaces used when reading EPUB container / package documents.
OPF_NS = "http://www.idpf.org/2007/opf"
DC_NS = "http://purl.org/dc/elements/1.1/"
CONTAINER_NS = "urn:oasis:names:tc:opendocument:xmlns:container"
# NOTE(review): XHTML_NS is not referenced in the visible part of this file.
XHTML_NS = "http://www.w3.org/1999/xhtml"

# Section roles assigned to EPUB spine items; "body" is also the default role
# of every non-EPUB document.
SECTION_ROLE_BODY = "body"
SECTION_ROLE_COVER = "cover"
SECTION_ROLE_NAV = "nav"
SECTION_ROLE_TOC = "toc"
SECTION_ROLE_HEADER = "header"
SECTION_ROLE_FOOTER = "footer"
SECTION_ROLE_NOTES = "notes"
SECTION_ROLE_LICENSE = "license"
SECTION_ROLE_AUXILIARY = "auxiliary"

# Normalised chapter labels (lower-case, punctuation removed) that reclassify a
# body section as table of contents, notes or license respectively.
TOC_LABEL_TOKENS = {"contents", "table of contents", "toc"}
NOTES_LABEL_TOKENS = {
    "transcribers notes",
    "transcriber notes",
    "transcribers note",
    "transcribers comments",
    "editors notes",
    "editor notes",
}
LICENSE_LABEL_TOKENS = {
    "license",
    "project gutenberg license",
    "the project gutenberg license",
    "license terms",
    "copyright",
    "colophon",
}

# Project Gutenberg boilerplate delimiters; matched against upper-cased section
# content to detect header / footer sections.
PG_START_MARKERS = ("*** START OF THE PROJECT GUTENBERG EBOOK", "*** START OF THIS PROJECT GUTENBERG EBOOK")
PG_END_MARKERS = ("*** END OF THE PROJECT GUTENBERG EBOOK", "*** END OF THIS PROJECT GUTENBERG EBOOK")


@dataclass(frozen=True)
class SourceChunk:
    """One normalised, size-bounded piece of a source document.

    Instances are produced by ``normalize_source``; ``to_dict`` returns the
    ``dataclasses.asdict`` representation.
    """

    # Identifier made unique within one normalize_source call.
    chunk_id: str
    # Title of this piece (document title, with " Part N" when split).
    title: str
    # Markdown text of the piece, starting with an H1 heading.
    markdown: str
    # Kind of source: "markdown", "text", "html" or "epub".
    source_type: str
    # Path of the originating file; EPUB members use "<epub path>!<member>".
    original_path: str
    # SHA-256 hex digest of ``markdown``.
    digest: str
    # Zero-based position of this piece within its document.
    chunk_index: int
    # Number of pieces the document was split into.
    chunk_count: int
    # UTC ISO-8601 timestamp taken once per normalize_source call.
    imported_at: str
    extractor_version: str = EXTRACTOR_VERSION
    # One of the SECTION_ROLE_* values.
    section_role: str = SECTION_ROLE_BODY
    # Position of the section in the EPUB spine; None for non-spine sources.
    spine_index: int | None = None
    # Copy of the book-level metadata parsed from the EPUB package document.
    book_metadata: dict = field(default_factory=dict)
    chapter_label: str | None = None
    chapter_number: int | None = None
    # Names of the page anchors that fall inside this piece.
    page_anchors: tuple = ()

    def to_dict(self) -> dict:
        """Return the chunk as a plain dictionary (via ``dataclasses.asdict``)."""
        return asdict(self)


@dataclass(frozen=True)
class _SourceDocument:
    """A whole source document converted to markdown, before it is split."""

    title: str
    # Full markdown text; anchor markers may still be embedded for EPUB HTML.
    markdown: str
    source_type: str
    original_path: str
    # Slug that chunk ids for this document are derived from.
    base_slug: str
    section_role: str = SECTION_ROLE_BODY
    spine_index: int | None = None
    book_metadata: dict = field(default_factory=dict)
    chapter_label: str | None = None
    chapter_number: int | None = None


@dataclass(frozen=True)
class _EpubManifestItem:
    """One ``<item>`` of the EPUB package manifest."""

    item_id: str
    # Archive member path: the item's href joined onto the OPF directory.
    href: str
    media_type: str
    # Whitespace-separated tokens of the ``properties`` attribute.
    properties: frozenset


@dataclass(frozen=True)
class _EpubSpineEntry:
    """One ``<itemref>`` of the EPUB spine."""

    # The ``idref`` attribute, i.e. the id of the referenced manifest item.
    item_id: str
    # False only when the itemref carries linear="no".
    linear: bool


def normalize_source(
    source: str | Path,
    *,
    max_words: int = 800,
    max_chunks: int | None = None,
    include_non_body: bool = False,
    overlap_words: int = 0,
) -> list[SourceChunk]:
    """Convert a source file or directory into a flat list of chunks.

    Args:
        source: File or directory to import.
        max_words: Word budget per chunk, forwarded to the splitter.
        max_chunks: Stop after this many chunks; ``None`` or a non-positive
            value means no limit.
        include_non_body: Forwarded to the document iterator; keeps EPUB
            sections whose role is not "body".
        overlap_words: Number of words shared by consecutive chunks.

    Returns:
        The chunks in document order, all carrying the same import timestamp.

    Raises:
        InfospaceError: ``missing_source`` when the path does not exist,
            ``unsupported_source`` when no supported document is found.
    """
    root = Path(source)
    if not root.exists():
        raise InfospaceError(
            "missing_source",
            f"Source path does not exist: {root}",
            {"source": str(root)},
        )

    found = list(_iter_documents(root, include_non_body=include_non_body))
    if not found:
        raise InfospaceError(
            "unsupported_source",
            f"No supported source documents found: {root}",
            {
                "source": str(root),
                "supported_extensions": sorted(SUPPORTED_EXTENSIONS),
            },
        )

    # One timestamp for the whole run so every chunk agrees on it.
    stamp = datetime.now(timezone.utc).isoformat()
    # Collapse "None or non-positive" into a single "no limit" value.
    limit = max_chunks if max_chunks is not None and max_chunks > 0 else None
    result: list[SourceChunk] = []
    taken_ids: set[str] = set()

    for doc in found:
        pieces = _split_document(doc, max_words=max_words, overlap_words=overlap_words)
        total = len(pieces)
        for position, (piece_title, piece_markdown, piece_anchors) in enumerate(pieces):
            if total == 1:
                wanted_id = doc.base_slug
            else:
                wanted_id = f"{doc.base_slug}-part-{position + 1:03d}"
            unique_id = _dedupe_chunk_id(wanted_id, taken_ids)
            chunk = SourceChunk(
                chunk_id=unique_id,
                title=piece_title,
                markdown=piece_markdown,
                source_type=doc.source_type,
                original_path=doc.original_path,
                digest=_digest_text(piece_markdown),
                chunk_index=position,
                chunk_count=total,
                imported_at=stamp,
                section_role=doc.section_role,
                spine_index=doc.spine_index,
                book_metadata=dict(doc.book_metadata),
                chapter_label=doc.chapter_label,
                chapter_number=doc.chapter_number,
                page_anchors=tuple(piece_anchors),
            )
            result.append(chunk)
            if limit is not None and len(result) >= limit:
                return result
    return result


def _iter_documents(
    source_path: Path, *, include_non_body: bool
) -> Iterable[_SourceDocument]:
    """Yield the documents found at ``source_path``.

    A directory is walked recursively in sorted path order and every file with
    a supported suffix is processed; a single file is dispatched on its
    lower-cased suffix. Files with an unrecognised suffix yield nothing.
    """
    if not source_path.is_dir():
        ext = source_path.suffix.lower()
        if ext == ".epub":
            yield from _epub_documents(source_path, include_non_body=include_non_body)
        elif ext in (".html", ".htm"):
            yield _html_document(source_path, source_type="html")
        elif ext == ".txt":
            yield _text_document(source_path)
        elif ext in (".md", ".markdown"):
            yield _markdown_document(source_path)
        return

    for candidate in sorted(source_path.rglob("*")):
        if not candidate.is_file():
            continue
        if candidate.suffix.lower() not in SUPPORTED_EXTENSIONS:
            continue
        yield from _iter_documents(candidate, include_non_body=include_non_body)


def _markdown_document(path: Path) -> _SourceDocument:
    """Load a markdown file (UTF-8) as a document.

    The title is the first H1 heading, or is derived from the file name when
    there is none; in that case an H1 is prepended to the text.
    """
    raw = path.read_text(encoding="utf-8")
    content = f"{_normalize_newlines(raw).strip()}\n"
    title = _markdown_title(content)
    if not title:
        title = _title_from_path(path)
    slug = slugify(title) or slugify(path.stem) or "source"
    return _SourceDocument(
        title=title,
        markdown=_ensure_h1(content, title),
        source_type="markdown",
        original_path=str(path),
        base_slug=slug,
    )


def _text_document(path: Path) -> _SourceDocument:
    """Load a plain-text file (UTF-8) as a document titled after its file name."""
    heading = _title_from_path(path)
    content = _normalize_newlines(path.read_text(encoding="utf-8")).strip()
    slug = slugify(heading) or "source"
    return _SourceDocument(
        title=heading,
        markdown=f"# {heading}\n\n{content}\n",
        source_type="text",
        original_path=str(path),
        base_slug=slug,
    )


def _html_document(
    path: Path,
    *,
    source_type: str,
    original_path: str | None = None,
    text: str | None = None,
) -> _SourceDocument:
    """Convert an HTML document into a markdown document.

    Args:
        path: Location of the file; read as UTF-8 unless ``text`` is given.
            Also used for the fallback title and slug.
        source_type: Value stored as the document's ``source_type``.
        original_path: Overrides ``str(path)`` as the recorded origin.
        text: Already-loaded HTML; when given the file is not read.

    Returns:
        A document whose markdown is an H1 title followed by the extracted
        text, with a leading repetition of the title removed from that text.
    """
    raw = text if text is not None else path.read_text(encoding="utf-8")
    title = _html_title(raw) or _title_from_path(path)
    body = _html_to_text(raw)
    # Drop the title when the body repeats it up front, but only on a token
    # boundary: a title of "Chapter 1" must not eat into "Chapter 10 ...".
    # Comparing an equal-length prefix also keeps the slice aligned and avoids
    # lower-casing the whole body.
    head, tail = body[: len(title)], body[len(title) :]
    if head.lower() == title.lower() and not tail[:1].isalnum():
        body = tail.strip()
    markdown = f"# {title}\n\n{body}\n"
    return _SourceDocument(
        title=title,
        markdown=markdown,
        source_type=source_type,
        original_path=original_path or str(path),
        base_slug=slugify(title) or slugify(path.stem) or "source",
    )


def _epub_documents(
    path: Path, *, include_non_body: bool
) -> Iterable[_SourceDocument]:
    """Yield the documents contained in an EPUB archive.

    When a package (OPF) document can be located the spine is followed;
    otherwise every text-like archive member is read directly.

    Raises:
        InfospaceError: ``invalid_epub_source`` when the file is not a
            readable zip archive.
    """
    try:
        with zipfile.ZipFile(path) as archive:
            opf_path = _resolve_opf_path(archive)
            if opf_path is None:
                documents = _epub_legacy_documents(archive, path)
            else:
                documents = _epub3_spine_documents(
                    archive, path, opf_path, include_non_body=include_non_body
                )
            # Consume inside the ``with`` so the archive stays open while reading.
            yield from documents
    except zipfile.BadZipFile as exc:
        raise InfospaceError(
            "invalid_epub_source",
            f"EPUB source is not a readable zip archive: {path}",
            {"source": str(path)},
        ) from exc


def _resolve_opf_path(archive: zipfile.ZipFile) -> str | None:
    """Return the package-document path declared in META-INF/container.xml.

    ``None`` is returned when the container file is missing or unparsable,
    when it declares no rootfile path, or when the declared path is not a
    member of the archive.
    """
    try:
        payload = archive.read("META-INF/container.xml")
    except KeyError:
        return None
    try:
        container = ET.fromstring(payload)
    except ET.ParseError:
        return None

    ns = f"{{{CONTAINER_NS}}}"
    rootfile = container.find(f"{ns}rootfiles/{ns}rootfile")
    declared = rootfile.attrib.get("full-path") if rootfile is not None else None
    if declared and declared in archive.namelist():
        return declared
    return None


def _parse_opf(
    archive: zipfile.ZipFile, opf_path: str
) -> tuple[dict, dict[str, _EpubManifestItem], list[_EpubSpineEntry]]:
    """Parse the package document into metadata, manifest and spine.

    Returns:
        ``(metadata, manifest, spine)`` where ``manifest`` maps item ids to
        items whose hrefs are joined onto the OPF directory, and ``spine``
        lists the itemrefs in document order. Manifest items lacking an id or
        href and itemrefs lacking an idref are skipped.
    """
    package = ET.fromstring(archive.read(opf_path).decode("utf-8", errors="replace"))
    metadata = _parse_opf_metadata(package)
    opf_dir = _zip_dirname(opf_path)

    manifest: dict[str, _EpubManifestItem] = {}
    for node in root_items(package) if False else package.findall(f"{{{OPF_NS}}}manifest/{{{OPF_NS}}}item"):
        attrs = node.attrib
        item_id = attrs.get("id", "")
        href = attrs.get("href", "")
        if not (href and item_id):
            continue
        manifest[item_id] = _EpubManifestItem(
            item_id=item_id,
            href=_join_zip_path(opf_dir, href),
            media_type=attrs.get("media-type", ""),
            properties=frozenset((attrs.get("properties") or "").split()),
        )

    spine = [
        _EpubSpineEntry(
            item_id=ref.attrib["idref"],
            linear=ref.attrib.get("linear", "yes") != "no",
        )
        for ref in package.findall(f"{{{OPF_NS}}}spine/{{{OPF_NS}}}itemref")
        if ref.attrib.get("idref")
    ]
    return metadata, manifest, spine


def _parse_opf_metadata(opf_root: ET.Element) -> dict:
    """Extract book-level metadata from the package document.

    Args:
        opf_root: Root element of the parsed OPF document.

    Returns:
        A dict containing only the keys that have a non-blank value:
        ``title``, ``creator`` (first creator), ``creators`` (all, when more
        than one), ``language``, ``rights``, ``subjects``, ``identifier``,
        ``source_url`` and ``modified``. Empty when there is no metadata
        element.
    """
    md = opf_root.find(f"{{{OPF_NS}}}metadata")
    if md is None:
        return {}

    def _all_text(tag: str) -> list[str]:
        # Whitespace-only elements collapse to "" and are dropped so they can
        # neither appear in list values nor shadow a later, real value.
        collapsed = (
            _collapse_ws(el.text or "") for el in md.findall(f"{{{DC_NS}}}{tag}")
        )
        return [value for value in collapsed if value]

    def _first_text(tag: str) -> str:
        values = _all_text(tag)
        return values[0] if values else ""

    out: dict = {}
    title = _first_text("title")
    if title:
        out["title"] = title
    creators = _all_text("creator")
    if creators:
        out["creator"] = creators[0]
        if len(creators) > 1:
            out["creators"] = creators
    language = _first_text("language")
    if language:
        out["language"] = language
    rights = _first_text("rights")
    if rights:
        out["rights"] = rights
    subjects = _all_text("subject")
    if subjects:
        out["subjects"] = subjects
    identifier = _first_text("identifier")
    if identifier:
        out["identifier"] = identifier
    source_url = _first_text("source")
    if source_url:
        out["source_url"] = source_url
    for meta in md.findall(f"{{{OPF_NS}}}meta"):
        prop = meta.attrib.get("property", "")
        text = _collapse_ws(meta.text) if meta.text else ""
        if not text:
            continue
        if prop == "dcterms:modified":
            out["modified"] = text
        elif prop == "dcterms:source" and "source_url" not in out:
            # dc:source wins; the meta property is only a fallback.
            out["source_url"] = text
    return out


def _epub3_spine_documents(
    archive: zipfile.ZipFile,
    source_path: Path,
    opf_path: str,
    *,
    include_non_body: bool,
) -> Iterable[_SourceDocument]:
    """Yield one document per readable spine item of an EPUB package.

    Args:
        archive: Open EPUB archive.
        source_path: Path of the EPUB file, used for fallback titles and for
            the recorded origin (``<epub path>!<member>``).
        opf_path: Archive path of the package document.
        include_non_body: When false, sections whose role is not "body" are
            skipped.

    Yields:
        Documents in spine order. Spine entries whose manifest item is missing
        or whose archive member cannot be read are skipped silently.
    """
    metadata, manifest, spine = _parse_opf(archive, opf_path)
    book_title = metadata.get("title") or _title_from_path(source_path)
    book_slug = slugify(book_title) or slugify(source_path.stem) or "ebook"
    nav_labels = _load_nav_labels(archive, manifest)
    used_chapter_numbers: set[int] = set()
    for spine_index, entry in enumerate(spine):
        item = manifest.get(entry.item_id)
        if item is None or not item.href:
            continue
        try:
            raw = archive.read(item.href).decode("utf-8", errors="replace")
        except KeyError:
            continue
        is_plain_text = Path(item.href).suffix.lower() in {".txt", ".md"}
        role = _classify_section(item, entry, raw)
        nav_label = nav_labels.get(item.href, "")
        heading_label = "" if is_plain_text else _first_heading_text(raw)
        chapter_label = nav_label or heading_label or None
        # Reclassify body sections whose chapter label matches known noise tokens
        if role == SECTION_ROLE_BODY and chapter_label:
            normalized_label = re.sub(r"[^a-z0-9 ]+", "", chapter_label.lower()).strip()
            if normalized_label in TOC_LABEL_TOKENS:
                role = SECTION_ROLE_TOC
            elif normalized_label in NOTES_LABEL_TOKENS:
                role = SECTION_ROLE_NOTES
            elif normalized_label in LICENSE_LABEL_TOKENS:
                role = SECTION_ROLE_LICENSE
        if role != SECTION_ROLE_BODY and not include_non_body:
            continue
        # Only body sections get a chapter number. Labels that are not a
        # roman/arabic numeral (e.g. "Preface") keep ``None`` so the slug
        # falls back to label-based naming.
        chapter_number: int | None = None
        if role == SECTION_ROLE_BODY and chapter_label:
            chapter_number = _parse_chapter_number(chapter_label)
        if chapter_number is not None:
            if chapter_number in used_chapter_numbers:
                # Duplicate numeric label across the book — keep label, drop the
                # numeric slot so the slug falls back to label-based naming.
                chapter_number = None
            else:
                used_chapter_numbers.add(chapter_number)
        if is_plain_text:
            title = chapter_label or _markdown_title(raw) or _title_from_path(Path(item.href))
            markdown_body = _ensure_h1(_normalize_newlines(raw).strip() + "\n", title)
        else:
            title = chapter_label or _html_title(raw) or _title_from_path(Path(item.href))
            text = _html_to_text(_inject_anchor_markers(raw))
            # Remove a leading repetition of the title, but only on a token
            # boundary so "Chapter 1" does not eat into "Chapter 10 ...".
            head, tail = text[: len(title)], text[len(title) :]
            if head.lower() == title.lower() and not tail[:1].isalnum():
                text = tail.strip()
            markdown_body = f"# {title}\n\n{text}\n"
        base_slug = _chapter_base_slug(
            role=role,
            chapter_number=chapter_number,
            chapter_label=chapter_label,
            book_slug=book_slug,
            spine_index=spine_index,
            href=item.href,
            title=title,
        )
        yield _SourceDocument(
            title=title,
            markdown=markdown_body,
            source_type="epub",
            original_path=f"{source_path}!{item.href}",
            base_slug=base_slug,
            section_role=role,
            spine_index=spine_index,
            book_metadata=metadata,
            chapter_label=chapter_label,
            chapter_number=chapter_number,
        )


def _chapter_base_slug(
    *,
    role: str,
    chapter_number: int | None,
    chapter_label: str | None,
    book_slug: str,
    spine_index: int,
    href: str,
    title: str,
) -> str:
    """Build the slug that a spine section's chunk ids are derived from.

    Body sections are named ``chapter-<NN>`` when numbered, otherwise
    ``chapter-<label slug>``. Every other section is named
    ``<book slug>-<spine position>-<section slug>``.
    """
    ordinal = f"{spine_index + 1:03d}"
    is_body = role == SECTION_ROLE_BODY

    if is_body and chapter_number is not None:
        return f"chapter-{chapter_number:02d}"

    if is_body and chapter_label:
        label_slug = slugify(chapter_label)
        if not label_slug:
            label_slug = f"section-{ordinal}"
        return f"chapter-{label_slug}"

    # First usable slug wins: the title, then the member's file stem.
    section_slug = slugify(title)
    if not section_slug:
        section_slug = slugify(Path(href).stem)
    if not section_slug:
        section_slug = f"section-{ordinal}"
    return f"{book_slug}-{ordinal}-{section_slug}"


def _load_nav_labels(
    archive: zipfile.ZipFile, manifest: dict[str, _EpubManifestItem]
) -> dict[str, str]:
    """Map archive paths to the link labels of the navigation document.

    The navigation document is the first manifest item carrying the ``nav``
    property. Link targets are resolved against its directory with any
    fragment dropped; the first label seen for a target is kept. Returns an
    empty mapping when there is no such item or it cannot be read.
    """
    nav_item = None
    for candidate in manifest.values():
        if "nav" in candidate.properties:
            nav_item = candidate
            break
    if nav_item is None:
        return {}

    try:
        nav_markup = archive.read(nav_item.href).decode("utf-8", errors="replace")
    except KeyError:
        return {}

    link_re = re.compile(
        r"""<a[^>]*\bhref=(?:"(?P<dhref>[^"#]+)(?:#[^"]*)?"|'(?P<shref>[^'#]+)(?:#[^']*)?')[^>]*>(?P<label>.*?)</a>""",
        re.I | re.S,
    )
    nav_dir = _zip_dirname(nav_item.href)
    labels: dict[str, str] = {}
    for link in link_re.finditer(nav_markup):
        target = link.group("dhref") or link.group("shref") or ""
        if not target:
            continue
        caption = _collapse_ws(_html_to_text(link.group("label")))
        if not caption:
            continue
        member = _join_zip_path(nav_dir, target)
        if member not in labels:
            labels[member] = caption
    return labels


def _first_heading_text(html_raw: str) -> str:
    """Return the plain text of the first h1-h6 heading, or "" when none exists."""
    found = HTML_FIRST_HEADING_RE.search(html_raw)
    if found is None:
        return ""
    inner_markup = found.group("title")
    return _collapse_ws(_html_to_text(inner_markup))


def _parse_chapter_number(label: str) -> int | None:
    """Extract a chapter number from a chapter label.

    Recognised forms, checked in this order: a bare decimal number ("12"), a
    bare roman numeral with optional trailing period ("XIV."), and
    "Chapter <number>" with a decimal or roman number.

    Args:
        label: Chapter label; surrounding whitespace is ignored.

    Returns:
        The chapter number, or ``None`` when the label is blank or carries no
        recognisable number.
    """
    stripped = label.strip()
    if not stripped:
        return None
    # ``isdecimal`` rather than ``isdigit``: the latter also accepts characters
    # such as "²" or "①" that ``int()`` rejects with ValueError.
    if stripped.isdecimal():
        return int(stripped)
    roman_match = ROMAN_NUMERAL_RE.match(stripped)
    if roman_match:
        value = _roman_to_int(roman_match.group(1).upper())
        if value > 0:
            return value
    chapter_match = CHAPTER_NUMBER_RE.match(stripped)
    if chapter_match:
        token = chapter_match.group("value")
        if token.isdecimal():
            return int(token)
        value = _roman_to_int(token.upper())
        if value > 0:
            return value
    return None


# Shape of an upper-case roman numeral: thousands, hundreds, tens, ones, each
# either a subtractive pair or an additive run. Runs of up to four are allowed
# so the additive forms "IIII" / "XXXX" remain accepted.
_ROMAN_SHAPE_RE = re.compile(
    r"M*(?:CM|CD|D?C{0,4})(?:XC|XL|L?X{0,4})(?:IX|IV|V?I{0,4})"
)


def _roman_to_int(value: str) -> int:
    """Convert an upper-case roman numeral to its integer value.

    Args:
        value: Candidate numeral; callers upper-case it first.

    Returns:
        The numeric value, or 0 when ``value`` is empty, contains a character
        that is not an upper-case roman digit, or is not shaped like a roman
        numeral. The shape check keeps ordinary words that happen to use only
        roman letters (e.g. "MIMI", "VIC", "CIVIL") from being read as numbers.
    """
    if not _ROMAN_SHAPE_RE.fullmatch(value):
        return 0
    table = {"I": 1, "V": 5, "X": 10, "L": 50, "C": 100, "D": 500, "M": 1000}
    total = 0
    prev = 0
    # Walk right to left: a digit smaller than its right neighbour subtracts.
    for ch in reversed(value):
        cur = table[ch]
        if cur < prev:
            total -= cur
        else:
            total += cur
        prev = cur
    return total


def _inject_anchor_markers(raw: str) -> str:
    """Append an inline anchor marker after every page-anchor opening tag.

    The tag itself is kept; the marker carries the tag's id so the anchor
    survives the later removal of markup.
    """

    def _with_marker(found: re.Match) -> str:
        tag = found.group(0)
        anchor = found.group("danchor") or found.group("sanchor") or ""
        if anchor:
            return f"{tag} ⟦anchor:{anchor}⟧ "
        return tag

    return ANCHOR_OPEN_TAG_RE.sub(_with_marker, raw)


def _epub_legacy_documents(
    archive: zipfile.ZipFile, source_path: Path
) -> Iterable[_SourceDocument]:
    """Yield a document for every text-like archive member, in sorted name order.

    Used when no package document is available. ``.txt`` / ``.md`` members are
    treated as markdown; ``.html`` / ``.htm`` / ``.xhtml`` members go through
    the HTML conversion.
    """
    markdown_suffixes = {".txt", ".md"}
    accepted_suffixes = {".html", ".htm", ".xhtml", ".txt", ".md"}
    members = [
        member
        for member in sorted(archive.namelist())
        if not member.endswith("/")
        and Path(member).suffix.lower() in accepted_suffixes
    ]
    for member in members:
        content = archive.read(member).decode("utf-8", errors="replace")
        member_path = Path(member)
        origin = f"{source_path}!{member}"
        if member_path.suffix.lower() not in markdown_suffixes:
            yield _html_document(
                member_path,
                source_type="epub",
                original_path=origin,
                text=content,
            )
            continue
        title = _markdown_title(content) or _title_from_path(member_path)
        body = _ensure_h1(_normalize_newlines(content).strip() + "\n", title)
        yield _SourceDocument(
            title=title,
            markdown=body,
            source_type="epub",
            original_path=origin,
            base_slug=slugify(title) or slugify(member_path.stem) or "source",
        )


def _classify_section(
    item: _EpubManifestItem,
    spine_entry: _EpubSpineEntry,
    content: str,
) -> str:
    """Assign a section role to a spine item.

    Checks run in a fixed order and the first hit wins: manifest properties,
    cover-like file names, a cover-like document title, then file-name hints
    for nav / toc / license / notes, Project Gutenberg markers in the content,
    Gutenberg header/footer file names, and finally the spine ``linear`` flag.
    Anything left over is a body section.
    """
    name = Path(item.href).name.lower()
    props = item.properties

    def name_has(*tokens: str) -> bool:
        return any(token in name for token in tokens)

    if "nav" in props:
        return SECTION_ROLE_NAV
    if "cover-image" in props or name.startswith("cover") or "titlepage" in name:
        return SECTION_ROLE_COVER

    doc_title = re.sub(r"[^a-z0-9 ]+", "", _html_title(content).lower()).strip()
    if doc_title in {"cover", "cover page", "title page", "titlepage"}:
        return SECTION_ROLE_COVER

    if name.startswith("nav"):
        return SECTION_ROLE_NAV
    if name_has("toc", "contents"):
        return SECTION_ROLE_TOC
    if name_has("license", "copyright", "rights"):
        return SECTION_ROLE_LICENSE
    if name_has("transcriber", "notes"):
        return SECTION_ROLE_NOTES

    # The marker constants are upper-case, so compare against upper-cased text.
    upper = content.upper()
    for markers, marker_role in (
        (PG_START_MARKERS, SECTION_ROLE_HEADER),
        (PG_END_MARKERS, SECTION_ROLE_FOOTER),
    ):
        if any(marker in upper for marker in markers):
            return marker_role

    if name_has("pgheader", "pg-header", "gutenberg-header"):
        return SECTION_ROLE_HEADER
    if name_has("pgfooter", "pg-footer", "gutenberg-footer"):
        return SECTION_ROLE_FOOTER

    return SECTION_ROLE_BODY if spine_entry.linear else SECTION_ROLE_AUXILIARY


def _zip_dirname(zip_path: str) -> str:
    """Return the directory part of an archive path, or "" when it has none.

    Backslashes are treated as path separators.
    """
    directory, _sep, _leaf = zip_path.replace("\\", "/").rpartition("/")
    return directory


def _join_zip_path(base: str, href: str) -> str:
    """Resolve ``href`` against the archive directory ``base``.

    Backslashes are treated as separators, leading slashes on ``href`` are
    ignored, and ``.`` / ``..`` segments are resolved so that a reference such
    as ``../Text/ch1.xhtml`` yields the path actually used for archive member
    names. ``..`` segments that would climb above the archive root are dropped.

    Args:
        base: Directory inside the archive ("" or "." for the root).
        href: Relative reference taken from a package or navigation document.

    Returns:
        The normalised, slash-separated archive path.
    """
    base = base.replace("\\", "/").strip("/")
    href = href.replace("\\", "/").lstrip("/")
    combined = href if not base or base == "." else f"{base}/{href}"
    resolved: list[str] = []
    for segment in combined.split("/"):
        if segment in ("", "."):
            continue
        if segment == "..":
            if resolved:
                resolved.pop()
            continue
        resolved.append(segment)
    return "/".join(resolved)


def _split_document(
    document: _SourceDocument,
    *,
    max_words: int,
    overlap_words: int,
) -> list[tuple[str, str, list[str]]]:
    """Split a document into word-bounded parts.

    Args:
        document: Document to split; its markdown may contain anchor markers.
        max_words: Maximum number of body words per part; a non-positive value
            disables splitting.
        overlap_words: Words repeated at the start of each following part,
            clamped to the range ``0 .. max_words - 1``.

    Returns:
        ``(title, markdown, anchors)`` tuples. A document that fits in one
        part keeps its title; otherwise parts are titled
        ``"<title> Part <n>"``. Every part's markdown starts with the
        document heading, and ``anchors`` lists the page anchors located in
        that part.
    """
    text = document.markdown.strip()
    heading = _markdown_title(text) or document.title or "Source"
    # Remove the first H1 line. The end-of-text alternative covers a document
    # that consists of the heading alone, which would otherwise be kept as
    # body text and show up twice in the composed chunk.
    body_with_markers = re.sub(r"(?m)^# .+?(?:\n+|\Z)", "", text, count=1).strip()
    clean_body, anchor_positions = _extract_anchor_positions(body_with_markers)
    words = clean_body.split()
    total = len(words)
    if max_words <= 0 or total <= max_words:
        anchors = [name for name, _idx in anchor_positions]
        return [(document.title, _compose_chunk(heading, clean_body), anchors)]
    overlap = max(0, min(overlap_words, max_words - 1))
    step = max_words - overlap
    parts: list[tuple[str, str, list[str]]] = []
    start = 0
    while start < total:
        end = min(start + max_words, total)
        is_last = end >= total
        part_text = " ".join(words[start:end]).strip()
        # An anchor placed after the final word has index == total; widen the
        # last window by one so it is reported, as it is when not splitting.
        anchor_end = end + 1 if is_last else end
        part_anchors = _anchors_in_range(anchor_positions, start, anchor_end)
        part_title = f"{document.title} Part {len(parts) + 1}"
        parts.append((part_title, _compose_chunk(heading, part_text), part_anchors))
        if is_last:
            break
        start += step
    return parts


def _compose_chunk(heading: str, body: str) -> str:
    """Render a chunk as an H1 heading followed by the trimmed body, if any."""
    trimmed = body.strip()
    sections = [f"# {heading}"]
    if trimmed:
        sections.append(trimmed)
    return "\n\n".join(sections) + "\n"


def _extract_anchor_positions(text: str) -> tuple[str, list[tuple[str, int]]]:
    """Strip anchor markers from ``text`` and record where they were.

    Args:
        text: Body text that may contain inline anchor markers.

    Returns:
        ``(cleaned, anchors)`` where ``cleaned`` is the text without markers
        (runs of spaces/tabs collapsed, ends trimmed) and ``anchors`` lists
        ``(anchor name, word index)`` pairs in order of appearance; the index
        is the number of whitespace-separated words preceding the marker.
    """
    pieces: list[str] = []
    anchors: list[tuple[str, int]] = []
    cursor = 0
    # Running word count, so each stretch of text is split exactly once
    # instead of re-splitting everything seen so far for every anchor.
    words_seen = 0
    for match in ANCHOR_MARKER_RE.finditer(text):
        prefix = text[cursor : match.start()]
        pieces.append(prefix)
        words_seen += len(prefix.split())
        anchors.append((match.group("anchor"), words_seen))
        cursor = match.end()
    pieces.append(text[cursor:])
    cleaned = re.sub(r"[ \t]{2,}", " ", "".join(pieces)).strip()
    return cleaned, anchors


def _anchors_in_range(
    anchor_positions: list[tuple[str, int]], start: int, end: int
) -> list[str]:
    """Return the distinct anchor names whose word index lies in ``[start, end)``.

    Names keep their order of first appearance.
    """
    in_window = (
        name for name, position in anchor_positions if start <= position < end
    )
    # dict.fromkeys drops repeats while preserving insertion order.
    return list(dict.fromkeys(in_window))


def _html_title(raw: str) -> str:
    """Return the document title taken from ``<title>`` or, failing that, ``<h1>``.

    An element that is present but has no text (e.g. ``<title></title>``) is
    skipped so the next candidate still gets a chance.

    Args:
        raw: HTML markup.

    Returns:
        The whitespace-collapsed title text, or "" when neither element
        provides any.
    """
    for pattern in (HTML_TITLE_RE, HTML_H1_RE):
        match = pattern.search(raw)
        if not match:
            continue
        title = _collapse_ws(_html_to_text(match.group("title")))
        if title:
            return title
    return ""


def _html_to_text(raw: str) -> str:
    """Reduce HTML markup to plain text with blank-line-separated paragraphs.

    The ``<head>`` element and script/style elements are removed with their
    content, closing block-level tags become line breaks, remaining tags
    become spaces and character references are decoded. Each non-empty line is
    whitespace-collapsed and the lines are joined with blank lines.
    """
    without_head = re.sub(r"<head\b[^>]*>.*?</head>", " ", raw, flags=re.I | re.S)
    without_code = SCRIPT_STYLE_RE.sub(" ", without_head)
    with_breaks = re.sub(
        r"</(p|div|section|article|h[1-6]|li)>", "\n", without_code, flags=re.I
    )
    plain = html.unescape(TAG_RE.sub(" ", with_breaks))

    paragraphs: list[str] = []
    for line in plain.splitlines():
        collapsed = _collapse_ws(line)
        if collapsed:
            paragraphs.append(collapsed)
    return "\n\n".join(paragraphs).strip()


def _ensure_h1(markdown: str, title: str) -> str:
    """Return ``markdown`` unchanged if it has an H1 line, else prepend one from ``title``."""
    has_h1 = re.search(r"(?m)^#\s+\S", markdown) is not None
    if not has_h1:
        return f"# {title}\n\n{markdown.strip()}\n"
    return markdown


def _markdown_title(markdown: str) -> str:
    """Return the text of the first H1 line in ``markdown``, or "" if there is none."""
    found = re.search(r"(?m)^#\s+(?P<title>.+?)\s*$", markdown)
    if found is None:
        return ""
    return found.group("title").strip()


def _title_from_path(path: Path) -> str:
    """Derive a human-readable title from a file name.

    Runs of characters that are neither letters nor digits (underscores
    included) become single spaces and the result is title-cased. Letters and
    digits of any script are kept, so "café" yields "Café" rather than "Caf".

    Args:
        path: Path whose stem supplies the words.

    Returns:
        The title, or "Source" when the stem contains no letters or digits.
    """
    words = re.sub(r"[\W_]+", " ", path.stem).strip()
    return words.title() if words else "Source"


def _dedupe_chunk_id(base_id: str, used_ids: set[str]) -> str:
    """Return an id based on ``base_id`` that is not yet in ``used_ids``.

    An empty ``base_id`` becomes "source". On a clash, "-2", "-3", ... is
    appended until the id is free. The chosen id is added to ``used_ids``.
    """
    stem = base_id or "source"
    unique = stem
    counter = 1
    while unique in used_ids:
        counter += 1
        unique = f"{stem}-{counter}"
    used_ids.add(unique)
    return unique


def _digest_text(text: str) -> str:
    """Return the SHA-256 hex digest of ``text`` encoded as UTF-8."""
    hasher = hashlib.sha256()
    hasher.update(text.encode("utf-8"))
    return hasher.hexdigest()


def _collapse_ws(value: str) -> str:
    """Replace every run of whitespace with one space and trim both ends."""
    fragments = re.split(r"\s+", value)
    return " ".join(fragments).strip()


def _normalize_newlines(value: str) -> str:
    """Convert CRLF and lone CR line endings to LF."""
    # "\r\n?" consumes a CR together with an immediately following LF, if any.
    return re.sub(r"\r\n?", "\n", value)