from __future__ import annotations import hashlib import html import re import zipfile from dataclasses import asdict, dataclass, field from datetime import datetime, timezone from pathlib import Path from typing import Iterable from xml.etree import ElementTree as ET from .errors import InfospaceError from .semantics import slugify EXTRACTOR_VERSION = "generic-source-intake-v3" SUPPORTED_EXTENSIONS = {".md", ".markdown", ".txt", ".html", ".htm", ".epub"} HTML_TITLE_RE = re.compile(r"]*>(?P.*?)", re.I | re.S) HTML_H1_RE = re.compile(r"]*>(?P.*?)</h1>", re.I | re.S) HTML_FIRST_HEADING_RE = re.compile( r"<(?P<tag>h[1-6])[^>]*>(?P<title>.*?)</(?P=tag)>", re.I | re.S ) SCRIPT_STYLE_RE = re.compile(r"<(script|style)[^>]*>.*?</\1>", re.I | re.S) TAG_RE = re.compile(r"<[^>]+>") ANCHOR_OPEN_TAG_RE = re.compile( r"""<(?P<name>[a-zA-Z][a-zA-Z0-9:-]*)[^>]*\bid=(?:"(?P<danchor>(?:Page_|page-|pg-|p-)[^"]*)"|'(?P<sanchor>(?:Page_|page-|pg-|p-)[^']*)')[^>]*>""", re.I, ) ANCHOR_MARKER_RE = re.compile(r"⟦anchor:(?P<anchor>[^⟧]+)⟧") ROMAN_NUMERAL_RE = re.compile(r"^([MDCLXVI]+)\.?$", re.I) CHAPTER_NUMBER_RE = re.compile( r"^chapter\s+(?P<value>[ivxlcdm]+|\d+)\b", re.I, ) OPF_NS = "http://www.idpf.org/2007/opf" DC_NS = "http://purl.org/dc/elements/1.1/" CONTAINER_NS = "urn:oasis:names:tc:opendocument:xmlns:container" XHTML_NS = "http://www.w3.org/1999/xhtml" SECTION_ROLE_BODY = "body" SECTION_ROLE_COVER = "cover" SECTION_ROLE_NAV = "nav" SECTION_ROLE_TOC = "toc" SECTION_ROLE_HEADER = "header" SECTION_ROLE_FOOTER = "footer" SECTION_ROLE_NOTES = "notes" SECTION_ROLE_LICENSE = "license" SECTION_ROLE_AUXILIARY = "auxiliary" TOC_LABEL_TOKENS = {"contents", "table of contents", "toc"} NOTES_LABEL_TOKENS = { "transcribers notes", "transcriber notes", "transcribers note", "transcribers comments", "editors notes", "editor notes", } LICENSE_LABEL_TOKENS = { "license", "project gutenberg license", "the project gutenberg license", "license terms", "copyright", "colophon", } PG_START_MARKERS = ("*** START OF THE PROJECT GUTENBERG EBOOK", "*** START OF THIS PROJECT GUTENBERG EBOOK") PG_END_MARKERS = ("*** END OF THE PROJECT GUTENBERG EBOOK", "*** END OF THIS PROJECT GUTENBERG EBOOK") @dataclass(frozen=True) class SourceChunk: chunk_id: str title: str markdown: str source_type: str original_path: str digest: str chunk_index: int chunk_count: int imported_at: str extractor_version: str = EXTRACTOR_VERSION section_role: str = SECTION_ROLE_BODY spine_index: int | None = None book_metadata: dict = field(default_factory=dict) chapter_label: str | None = None chapter_number: int | None = None page_anchors: tuple = () def to_dict(self) -> dict: return asdict(self) @dataclass(frozen=True) class _SourceDocument: title: str markdown: str source_type: str original_path: str base_slug: str section_role: str = SECTION_ROLE_BODY spine_index: int | None = None book_metadata: dict = field(default_factory=dict) chapter_label: str | None = None chapter_number: int | None = None @dataclass(frozen=True) class _EpubManifestItem: item_id: str href: str media_type: str properties: frozenset @dataclass(frozen=True) class _EpubSpineEntry: item_id: str linear: bool def normalize_source( source: str | Path, *, max_words: int = 800, max_chunks: int | None = None, include_non_body: bool = False, overlap_words: int = 0, ) -> list[SourceChunk]: source_path = Path(source) if not source_path.exists(): raise InfospaceError( "missing_source", f"Source path does not exist: {source_path}", {"source": str(source_path)}, ) documents = list(_iter_documents(source_path, include_non_body=include_non_body)) if not documents: raise InfospaceError( "unsupported_source", f"No supported source documents found: {source_path}", { "source": str(source_path), "supported_extensions": sorted(SUPPORTED_EXTENSIONS), }, ) imported_at = datetime.now(timezone.utc).isoformat() chunks: list[SourceChunk] = [] used_ids: set[str] = set() for document in documents: pieces = _split_document(document, max_words=max_words, overlap_words=overlap_words) for index, (part_title, part_markdown, part_anchors) in enumerate(pieces): base_id = ( document.base_slug if len(pieces) == 1 else f"{document.base_slug}-part-{index + 1:03d}" ) chunk_id = _dedupe_chunk_id(base_id, used_ids) chunks.append( SourceChunk( chunk_id=chunk_id, title=part_title, markdown=part_markdown, source_type=document.source_type, original_path=document.original_path, digest=_digest_text(part_markdown), chunk_index=index, chunk_count=len(pieces), imported_at=imported_at, section_role=document.section_role, spine_index=document.spine_index, book_metadata=dict(document.book_metadata), chapter_label=document.chapter_label, chapter_number=document.chapter_number, page_anchors=tuple(part_anchors), ) ) if max_chunks is not None and max_chunks > 0 and len(chunks) >= max_chunks: return chunks return chunks def _iter_documents( source_path: Path, *, include_non_body: bool ) -> Iterable[_SourceDocument]: if source_path.is_dir(): for path in sorted(source_path.rglob("*")): if path.is_file() and path.suffix.lower() in SUPPORTED_EXTENSIONS: yield from _iter_documents(path, include_non_body=include_non_body) return suffix = source_path.suffix.lower() if suffix in (".md", ".markdown"): yield _markdown_document(source_path) elif suffix == ".txt": yield _text_document(source_path) elif suffix in (".html", ".htm"): yield _html_document(source_path, source_type="html") elif suffix == ".epub": yield from _epub_documents(source_path, include_non_body=include_non_body) def _markdown_document(path: Path) -> _SourceDocument: markdown = _normalize_newlines(path.read_text(encoding="utf-8")).strip() + "\n" title = _markdown_title(markdown) or _title_from_path(path) return _SourceDocument( title=title, markdown=_ensure_h1(markdown, title), source_type="markdown", original_path=str(path), base_slug=slugify(title) or slugify(path.stem) or "source", ) def _text_document(path: Path) -> _SourceDocument: title = _title_from_path(path) body = _normalize_newlines(path.read_text(encoding="utf-8")).strip() markdown = f"# {title}\n\n{body}\n" return _SourceDocument( title=title, markdown=markdown, source_type="text", original_path=str(path), base_slug=slugify(title) or "source", ) def _html_document( path: Path, *, source_type: str, original_path: str | None = None, text: str | None = None, ) -> _SourceDocument: raw = text if text is not None else path.read_text(encoding="utf-8") title = _html_title(raw) or _title_from_path(path) body = _html_to_text(raw) if body.lower().startswith(title.lower()): body = body[len(title) :].strip() markdown = f"# {title}\n\n{body}\n" return _SourceDocument( title=title, markdown=markdown, source_type=source_type, original_path=original_path or str(path), base_slug=slugify(title) or slugify(path.stem) or "source", ) def _epub_documents( path: Path, *, include_non_body: bool ) -> Iterable[_SourceDocument]: try: with zipfile.ZipFile(path) as archive: opf_path = _resolve_opf_path(archive) if opf_path is not None: yield from _epub3_spine_documents( archive, path, opf_path, include_non_body=include_non_body ) else: yield from _epub_legacy_documents(archive, path) except zipfile.BadZipFile as exc: raise InfospaceError( "invalid_epub_source", f"EPUB source is not a readable zip archive: {path}", {"source": str(path)}, ) from exc def _resolve_opf_path(archive: zipfile.ZipFile) -> str | None: try: raw = archive.read("META-INF/container.xml") except KeyError: return None try: root = ET.fromstring(raw) except ET.ParseError: return None rootfile = root.find(f"{{{CONTAINER_NS}}}rootfiles/{{{CONTAINER_NS}}}rootfile") if rootfile is None: return None full_path = rootfile.attrib.get("full-path") if not full_path: return None if full_path not in archive.namelist(): return None return full_path def _parse_opf( archive: zipfile.ZipFile, opf_path: str ) -> tuple[dict, dict[str, _EpubManifestItem], list[_EpubSpineEntry]]: raw = archive.read(opf_path).decode("utf-8", errors="replace") root = ET.fromstring(raw) metadata = _parse_opf_metadata(root) base = _zip_dirname(opf_path) manifest: dict[str, _EpubManifestItem] = {} for item in root.findall(f"{{{OPF_NS}}}manifest/{{{OPF_NS}}}item"): href = item.attrib.get("href", "") item_id = item.attrib.get("id", "") if not href or not item_id: continue manifest[item_id] = _EpubManifestItem( item_id=item_id, href=_join_zip_path(base, href), media_type=item.attrib.get("media-type", ""), properties=frozenset((item.attrib.get("properties") or "").split()), ) spine: list[_EpubSpineEntry] = [] for entry in root.findall(f"{{{OPF_NS}}}spine/{{{OPF_NS}}}itemref"): idref = entry.attrib.get("idref") if not idref: continue spine.append( _EpubSpineEntry( item_id=idref, linear=entry.attrib.get("linear", "yes") != "no", ) ) return metadata, manifest, spine def _parse_opf_metadata(opf_root: ET.Element) -> dict: md = opf_root.find(f"{{{OPF_NS}}}metadata") if md is None: return {} def _first_text(tag: str) -> str: el = md.find(f"{{{DC_NS}}}{tag}") return _collapse_ws(el.text) if el is not None and el.text else "" def _all_text(tag: str) -> list[str]: return [ _collapse_ws(el.text) for el in md.findall(f"{{{DC_NS}}}{tag}") if el is not None and el.text ] out: dict = {} title = _first_text("title") if title: out["title"] = title creators = _all_text("creator") if creators: out["creator"] = creators[0] if len(creators) > 1: out["creators"] = creators language = _first_text("language") if language: out["language"] = language rights = _first_text("rights") if rights: out["rights"] = rights subjects = _all_text("subject") if subjects: out["subjects"] = subjects identifier = _first_text("identifier") if identifier: out["identifier"] = identifier source_url = _first_text("source") if source_url: out["source_url"] = source_url for meta in md.findall(f"{{{OPF_NS}}}meta"): prop = meta.attrib.get("property", "") text = _collapse_ws(meta.text) if meta.text else "" if not text: continue if prop == "dcterms:modified": out["modified"] = text elif prop == "dcterms:source" and "source_url" not in out: out["source_url"] = text return out def _epub3_spine_documents( archive: zipfile.ZipFile, source_path: Path, opf_path: str, *, include_non_body: bool, ) -> Iterable[_SourceDocument]: metadata, manifest, spine = _parse_opf(archive, opf_path) book_title = metadata.get("title") or _title_from_path(source_path) book_slug = slugify(book_title) or slugify(source_path.stem) or "ebook" nav_labels = _load_nav_labels(archive, manifest) chapter_counter = 0 used_chapter_numbers: set[int] = set() for spine_index, entry in enumerate(spine): item = manifest.get(entry.item_id) if item is None or not item.href: continue try: raw = archive.read(item.href).decode("utf-8", errors="replace") except KeyError: continue role = _classify_section(item, entry, raw) nav_label = nav_labels.get(item.href, "") heading_label = _first_heading_text(raw) if Path(item.href).suffix.lower() not in {".txt", ".md"} else "" chapter_label = nav_label or heading_label or None # Reclassify body sections whose chapter label matches known noise tokens if role == SECTION_ROLE_BODY and chapter_label: normalized_label = re.sub(r"[^a-z0-9 ]+", "", chapter_label.lower()).strip() if normalized_label in TOC_LABEL_TOKENS: role = SECTION_ROLE_TOC elif normalized_label in NOTES_LABEL_TOKENS: role = SECTION_ROLE_NOTES elif normalized_label in LICENSE_LABEL_TOKENS: role = SECTION_ROLE_LICENSE if role != SECTION_ROLE_BODY and not include_non_body: continue chapter_number: int | None = None if role == SECTION_ROLE_BODY and chapter_label: chapter_number = _parse_chapter_number(chapter_label) if role == SECTION_ROLE_BODY and chapter_number is None: # Fall back to sequential body counter when chapter label exists but # is not a roman/arabic numeral (e.g. "Preface"). if chapter_label: chapter_counter += 1 # Use sequential only when no other body has claimed this slot; # roman-numeral chapters take precedence and may overlap, so we # leave chapter_number=None for non-numeric labels and let the # slug fall back to the label slug. if chapter_number is not None: if chapter_number in used_chapter_numbers: # Duplicate numeric label across the book — keep label, drop the # numeric slot so the slug falls back to label-based naming. chapter_number = None else: used_chapter_numbers.add(chapter_number) suffix = Path(item.href).suffix.lower() if suffix in {".txt", ".md"}: title = chapter_label or _markdown_title(raw) or _title_from_path(Path(item.href)) markdown_body = _ensure_h1(_normalize_newlines(raw).strip() + "\n", title) else: title = chapter_label or _html_title(raw) or _title_from_path(Path(item.href)) marked = _inject_anchor_markers(raw) text = _html_to_text(marked) if text.lower().startswith(title.lower()): text = text[len(title) :].strip() markdown_body = f"# {title}\n\n{text}\n" base_slug = _chapter_base_slug( role=role, chapter_number=chapter_number, chapter_label=chapter_label, book_slug=book_slug, spine_index=spine_index, href=item.href, title=title, ) yield _SourceDocument( title=title, markdown=markdown_body, source_type="epub", original_path=f"{source_path}!{item.href}", base_slug=base_slug, section_role=role, spine_index=spine_index, book_metadata=metadata, chapter_label=chapter_label, chapter_number=chapter_number, ) def _chapter_base_slug( *, role: str, chapter_number: int | None, chapter_label: str | None, book_slug: str, spine_index: int, href: str, title: str, ) -> str: if role == SECTION_ROLE_BODY and chapter_number is not None: return f"chapter-{chapter_number:02d}" if role == SECTION_ROLE_BODY and chapter_label: return f"chapter-{slugify(chapter_label) or f'section-{spine_index + 1:03d}'}" section_slug = ( slugify(title) or slugify(Path(href).stem) or f"section-{spine_index + 1:03d}" ) return f"{book_slug}-{spine_index + 1:03d}-{section_slug}" def _load_nav_labels( archive: zipfile.ZipFile, manifest: dict[str, _EpubManifestItem] ) -> dict[str, str]: nav_item = next( (item for item in manifest.values() if "nav" in item.properties), None, ) if nav_item is None: return {} try: raw = archive.read(nav_item.href).decode("utf-8", errors="replace") except KeyError: return {} base = _zip_dirname(nav_item.href) labels: dict[str, str] = {} pattern = re.compile( r"""<a[^>]*\bhref=(?:"(?P<dhref>[^"#]+)(?:#[^"]*)?"|'(?P<shref>[^'#]+)(?:#[^']*)?')[^>]*>(?P<label>.*?)</a>""", re.I | re.S, ) for match in pattern.finditer(raw): href_attr = match.group("dhref") or match.group("shref") or "" if not href_attr: continue label = _collapse_ws(_html_to_text(match.group("label"))) if not label: continue resolved = _join_zip_path(base, href_attr) labels.setdefault(resolved, label) return labels def _first_heading_text(html_raw: str) -> str: match = HTML_FIRST_HEADING_RE.search(html_raw) if not match: return "" return _collapse_ws(_html_to_text(match.group("title"))) def _parse_chapter_number(label: str) -> int | None: stripped = label.strip() if not stripped: return None if stripped.isdigit(): return int(stripped) roman_match = ROMAN_NUMERAL_RE.match(stripped) if roman_match: value = _roman_to_int(roman_match.group(1).upper()) if value > 0: return value chapter_match = CHAPTER_NUMBER_RE.match(stripped) if chapter_match: token = chapter_match.group("value") if token.isdigit(): return int(token) value = _roman_to_int(token.upper()) if value > 0: return value return None def _roman_to_int(value: str) -> int: table = {"I": 1, "V": 5, "X": 10, "L": 50, "C": 100, "D": 500, "M": 1000} total = 0 prev = 0 for ch in reversed(value): cur = table.get(ch, 0) if cur == 0: return 0 if cur < prev: total -= cur else: total += cur prev = cur return total def _inject_anchor_markers(raw: str) -> str: def repl(match: re.Match) -> str: anchor = match.group("danchor") or match.group("sanchor") or "" if not anchor: return match.group(0) return f"{match.group(0)} ⟦anchor:{anchor}⟧ " return ANCHOR_OPEN_TAG_RE.sub(repl, raw) def _epub_legacy_documents( archive: zipfile.ZipFile, source_path: Path ) -> Iterable[_SourceDocument]: names = [ name for name in sorted(archive.namelist()) if Path(name).suffix.lower() in {".html", ".htm", ".xhtml", ".txt", ".md"} and not name.endswith("/") ] for name in names: raw = archive.read(name).decode("utf-8", errors="replace") pseudo_path = Path(name) if pseudo_path.suffix.lower() in {".txt", ".md"}: title = _markdown_title(raw) or _title_from_path(pseudo_path) markdown = _ensure_h1(_normalize_newlines(raw).strip() + "\n", title) yield _SourceDocument( title=title, markdown=markdown, source_type="epub", original_path=f"{source_path}!{name}", base_slug=slugify(title) or slugify(pseudo_path.stem) or "source", ) else: yield _html_document( pseudo_path, source_type="epub", original_path=f"{source_path}!{name}", text=raw, ) def _classify_section( item: _EpubManifestItem, spine_entry: _EpubSpineEntry, content: str, ) -> str: name = Path(item.href).name.lower() if "nav" in item.properties: return SECTION_ROLE_NAV if "cover-image" in item.properties: return SECTION_ROLE_COVER if name.startswith("cover") or "titlepage" in name: return SECTION_ROLE_COVER doc_title = re.sub(r"[^a-z0-9 ]+", "", _html_title(content).lower()).strip() if doc_title in {"cover", "cover page", "title page", "titlepage"}: return SECTION_ROLE_COVER if name.startswith("nav"): return SECTION_ROLE_NAV if "toc" in name or "contents" in name: return SECTION_ROLE_TOC if "license" in name or "copyright" in name or "rights" in name: return SECTION_ROLE_LICENSE if "transcriber" in name or "notes" in name: return SECTION_ROLE_NOTES upper = content.upper() if any(marker in upper for marker in PG_START_MARKERS): return SECTION_ROLE_HEADER if any(marker in upper for marker in PG_END_MARKERS): return SECTION_ROLE_FOOTER if "pgheader" in name or "pg-header" in name or "gutenberg-header" in name: return SECTION_ROLE_HEADER if "pgfooter" in name or "pg-footer" in name or "gutenberg-footer" in name: return SECTION_ROLE_FOOTER if not spine_entry.linear: return SECTION_ROLE_AUXILIARY return SECTION_ROLE_BODY def _zip_dirname(zip_path: str) -> str: normalized = zip_path.replace("\\", "/") if "/" not in normalized: return "" return normalized.rsplit("/", 1)[0] def _join_zip_path(base: str, href: str) -> str: base = base.replace("\\", "/").strip("/") href = href.replace("\\", "/").lstrip("/") if not base or base == ".": return href return f"{base}/{href}" def _split_document( document: _SourceDocument, *, max_words: int, overlap_words: int, ) -> list[tuple[str, str, list[str]]]: text = document.markdown.strip() heading = _markdown_title(text) or document.title or "Source" body_with_markers = re.sub(r"(?m)^# .+?\n+", "", text, count=1).strip() clean_body, anchor_positions = _extract_anchor_positions(body_with_markers) words = clean_body.split() if max_words <= 0 or len(words) <= max_words: anchors = [name for name, _idx in anchor_positions] return [(document.title, _compose_chunk(heading, clean_body), anchors)] overlap = max(0, min(overlap_words, max_words - 1)) step = max_words - overlap if overlap > 0 else max_words parts: list[tuple[str, str, list[str]]] = [] start = 0 while start < len(words): end = min(start + max_words, len(words)) slice_words = words[start:end] if not slice_words: break part_index = len(parts) + 1 part_text = " ".join(slice_words).strip() part_anchors = _anchors_in_range(anchor_positions, start, end) part_title = f"{document.title} Part {part_index}" parts.append((part_title, _compose_chunk(heading, part_text), part_anchors)) if end >= len(words): break start += step return parts def _compose_chunk(heading: str, body: str) -> str: body = body.strip() if not body: return f"# {heading}\n" return f"# {heading}\n\n{body}\n" def _extract_anchor_positions(text: str) -> tuple[str, list[tuple[str, int]]]: parts: list[str] = [] anchors: list[tuple[str, int]] = [] cursor = 0 for match in ANCHOR_MARKER_RE.finditer(text): prefix = text[cursor : match.start()] parts.append(prefix) word_index = sum(len(part.split()) for part in parts) anchors.append((match.group("anchor"), word_index)) cursor = match.end() parts.append(text[cursor:]) cleaned = re.sub(r"[ \t]{2,}", " ", "".join(parts)).strip() return cleaned, anchors def _anchors_in_range( anchor_positions: list[tuple[str, int]], start: int, end: int ) -> list[str]: seen: set[str] = set() found: list[str] = [] for name, idx in anchor_positions: if start <= idx < end and name not in seen: seen.add(name) found.append(name) return found def _html_title(raw: str) -> str: match = HTML_TITLE_RE.search(raw) or HTML_H1_RE.search(raw) if not match: return "" return _collapse_ws(_html_to_text(match.group("title"))) def _html_to_text(raw: str) -> str: cleaned = re.sub(r"<head\b[^>]*>.*?</head>", " ", raw, flags=re.I | re.S) cleaned = SCRIPT_STYLE_RE.sub(" ", cleaned) cleaned = re.sub(r"</(p|div|section|article|h[1-6]|li)>", "\n", cleaned, flags=re.I) cleaned = TAG_RE.sub(" ", cleaned) cleaned = html.unescape(cleaned) lines = [_collapse_ws(line) for line in cleaned.splitlines()] return "\n\n".join(line for line in lines if line).strip() def _ensure_h1(markdown: str, title: str) -> str: if re.search(r"(?m)^#\s+\S", markdown): return markdown return f"# {title}\n\n{markdown.strip()}\n" def _markdown_title(markdown: str) -> str: match = re.search(r"(?m)^#\s+(?P<title>.+?)\s*$", markdown) return match.group("title").strip() if match else "" def _title_from_path(path: Path) -> str: words = re.sub(r"[^A-Za-z0-9]+", " ", path.stem).strip() return words.title() if words else "Source" def _dedupe_chunk_id(base_id: str, used_ids: set[str]) -> str: candidate = base_id or "source" if candidate not in used_ids: used_ids.add(candidate) return candidate index = 2 while f"{candidate}-{index}" in used_ids: index += 1 deduped = f"{candidate}-{index}" used_ids.add(deduped) return deduped def _digest_text(text: str) -> str: return hashlib.sha256(text.encode("utf-8")).hexdigest() def _collapse_ws(value: str) -> str: return re.sub(r"\s+", " ", value).strip() def _normalize_newlines(value: str) -> str: return value.replace("\r\n", "\n").replace("\r", "\n")