Files
infospace-bench/src/infospace_bench/generation.py

128 lines
3.8 KiB
Python

from __future__ import annotations
import re
from dataclasses import asdict, dataclass
from pathlib import Path
from typing import Any
from .errors import InfospaceError
from .lifecycle import register_artifact
from .semantics import slugify
# Matches a level-one Markdown heading line ("# Title") in multiline mode and
# captures the heading text, without trailing whitespace, as the "title" group.
ENTITY_HEADING_RE = re.compile(r"(?m)^# (?P<title>.+?)\s*$")
@dataclass(frozen=True)
class EntityBundleItem:
    """One entity section extracted from an entity bundle document."""

    # Heading text of the entity section.
    title: str
    # Identifier used to build the artifact id and the on-disk file name.
    slug: str
    # Markdown text of the section.
    markdown: str

    @property
    def artifact_id(self) -> str:
        """Return the artifact identifier, ``entity/<slug>.md``."""
        return "entity/" + self.slug + ".md"

    @property
    def path(self) -> str:
        """Return the artifact's relative path, ``artifacts/entities/<slug>.md``."""
        return "artifacts/entities/" + self.slug + ".md"

    def to_dict(self) -> dict[str, Any]:
        """Return the dataclass fields plus the derived ``artifact_id`` and ``path``."""
        payload: dict[str, Any] = asdict(self)
        # Derived values go after the declared fields, keeping key order stable.
        payload["artifact_id"] = self.artifact_id
        payload["path"] = self.path
        return payload
def parse_entity_bundle(markdown: str) -> list[EntityBundleItem]:
    """Split a bundle document into one item per top-level ``# `` heading.

    A section starts at its heading and runs up to the next top-level
    heading, or to the end of the document for the last one. Text before the
    first heading is not included in any item.

    NOTE: every line starting with ``# `` is treated as an entity heading,
    wherever it appears in the document.

    Args:
        markdown: Full text of the entity bundle.

    Returns:
        The parsed items, in document order.

    Raises:
        InfospaceError: ``invalid_entity_bundle`` when there is no top-level
            heading, when a heading's title produces an empty slug, or when a
            section has no ``## Definition`` heading line;
            ``duplicate_entity_bundle_item`` when two headings produce the
            same slug.
    """
    headings = list(ENTITY_HEADING_RE.finditer(markdown))
    if not headings:
        raise InfospaceError(
            "invalid_entity_bundle",
            "Entity bundle does not contain any top-level entity headings",
            {"required_heading": "# <Entity Title>"},
        )

    # A section ends where the following heading begins; the final section
    # ends at the end of the document.
    section_ends = [following.start() for following in headings[1:]]
    section_ends.append(len(markdown))

    parsed: list[EntityBundleItem] = []
    used_slugs: set[str] = set()
    for heading, section_end in zip(headings, section_ends):
        title = heading.group("title").strip()
        slug = slugify(title)
        if not slug:
            raise InfospaceError(
                "invalid_entity_bundle",
                "Entity bundle contains an empty entity heading",
                {"title": title},
            )
        if slug in used_slugs:
            raise InfospaceError(
                "duplicate_entity_bundle_item",
                f"Entity bundle contains duplicate entity: {title}",
                {"slug": slug, "title": title},
            )

        # Normalise the section to end with exactly one newline.
        body = markdown[heading.start() : section_end].strip() + "\n"
        if re.search(r"(?m)^## Definition\s*$", body) is None:
            raise InfospaceError(
                "invalid_entity_bundle",
                f"Entity bundle item is missing a Definition section: {title}",
                {"slug": slug, "missing_sections": ["definition"]},
            )

        used_slugs.add(slug)
        parsed.append(EntityBundleItem(title=title, slug=slug, markdown=body))
    return parsed
def write_entity_bundle_artifacts(
    root: str | Path,
    markdown: str,
    *,
    workflow_id: str,
    stage_id: str,
    input_artifact_id: str,
    source_bundle_artifact_id: str = "",
    provider: str = "",
    dry_run: bool = False,
) -> list[EntityBundleItem]:
    """Parse an entity bundle, write each entity to disk and register it.

    Each parsed item is written as UTF-8 to ``root / item.path`` (parent
    directories are created as needed) and then passed to
    ``register_artifact`` with kind ``"entity"``.

    Args:
        root: Base directory that the item paths are resolved against.
        markdown: Full text of the entity bundle.
        workflow_id: Recorded in each artifact's provenance.
        stage_id: Recorded in each artifact's provenance.
        input_artifact_id: Recorded in provenance and used as the target of
            the ``generated_from`` relationship.
        source_bundle_artifact_id: When non-empty, recorded in provenance and
            used as the target of an additional ``split_from`` relationship.
        provider: When non-empty, recorded in provenance.
        dry_run: When true, the bundle is only parsed; nothing is written or
            registered.

    Returns:
        The parsed items, in document order.

    Raises:
        InfospaceError: Propagated from ``parse_entity_bundle`` when the
            bundle is invalid.
    """
    entities = parse_entity_bundle(markdown)
    base_dir = Path(root)
    if dry_run:
        return entities

    for entity in entities:
        destination = base_dir / entity.path
        destination.parent.mkdir(parents=True, exist_ok=True)
        destination.write_text(entity.markdown, encoding="utf-8")

        # Fresh containers per entity so registrations never share state.
        provenance = {
            "workflow_id": workflow_id,
            "stage_id": stage_id,
            "input_artifact_id": input_artifact_id,
        }
        links = [{"type": "generated_from", "target": input_artifact_id}]
        if source_bundle_artifact_id:
            provenance["source_bundle_artifact_id"] = source_bundle_artifact_id
            links.append({"type": "split_from", "target": source_bundle_artifact_id})
        if provider:
            provenance["provider"] = provider

        register_artifact(
            base_dir,
            artifact_id=entity.artifact_id,
            path=entity.path,
            kind="entity",
            title=entity.title,
            provenance=provenance,
            relationships=links,
        )
    return entities