generated from coulomb/repo-seed
infospace pipeline for wealth of nations example
This commit is contained in:
127
src/infospace_bench/generation.py
Normal file
127
src/infospace_bench/generation.py
Normal file
@@ -0,0 +1,127 @@
|
||||
from __future__ import annotations
|
||||
|
||||
import re
|
||||
from dataclasses import asdict, dataclass
|
||||
from pathlib import Path
|
||||
from typing import Any
|
||||
|
||||
from .errors import InfospaceError
|
||||
from .lifecycle import register_artifact
|
||||
from .semantics import slugify
|
||||
|
||||
|
||||
# Matches a top-level Markdown heading line of the form "# Title".
# The inline (?m) flag makes ^ and $ anchor at line boundaries, so headings are
# found anywhere in the text; the heading text (without trailing whitespace)
# is captured in the named group "title".
ENTITY_HEADING_RE = re.compile(r"(?m)^# (?P<title>.+?)\s*$")
|
||||
|
||||
|
||||
@dataclass(frozen=True)
class EntityBundleItem:
    """One entity section extracted from a Markdown entity bundle.

    Attributes:
        title: Title of the entity.
        slug: Slug of the entity; used to derive ``artifact_id`` and ``path``.
        markdown: Markdown text of the entity's section.
    """

    title: str
    slug: str
    markdown: str

    @property
    def artifact_id(self) -> str:
        """Return the artifact identifier, ``entity/<slug>.md``."""
        return f"entity/{self.slug}.md"

    @property
    def path(self) -> str:
        """Return the artifact path, ``artifacts/entities/<slug>.md``."""
        return f"artifacts/entities/{self.slug}.md"

    def to_dict(self) -> dict[str, Any]:
        """Return the dataclass fields plus the derived id and path.

        Keys appear in the order ``title``, ``slug``, ``markdown``,
        ``artifact_id``, ``path``.
        """
        return {
            **asdict(self),
            "artifact_id": self.artifact_id,
            "path": self.path,
        }
|
||||
|
||||
|
||||
# Matches a line consisting of the "## Definition" heading; compiled once
# instead of being looked up for every section.
_DEFINITION_HEADING_RE = re.compile(r"(?m)^## Definition\s*$")


def parse_entity_bundle(markdown: str) -> list[EntityBundleItem]:
    """Split a Markdown bundle into one item per top-level heading.

    A section starts at a heading matched by ``ENTITY_HEADING_RE`` and runs up
    to the next such heading, or to the end of the text for the last one. Any
    text before the first heading is not part of any section. Each section is
    stripped of surrounding whitespace and terminated with a single newline.

    Args:
        markdown: Full text of the entity bundle.

    Returns:
        The parsed items, in document order.

    Raises:
        InfospaceError: With code ``invalid_entity_bundle`` when the text has
            no top-level heading, when a heading yields an empty slug, or when
            a section has no ``## Definition`` heading line; with code
            ``duplicate_entity_bundle_item`` when two headings yield the same
            slug.
    """
    headings = list(ENTITY_HEADING_RE.finditer(markdown))
    if not headings:
        raise InfospaceError(
            "invalid_entity_bundle",
            "Entity bundle does not contain any top-level entity headings",
            {"required_heading": "# <Entity Title>"},
        )

    # Section i ends where heading i + 1 starts; the last one ends at EOF.
    section_ends = [heading.start() for heading in headings[1:]]
    section_ends.append(len(markdown))

    items: list[EntityBundleItem] = []
    seen_slugs: set[str] = set()
    for heading, section_end in zip(headings, section_ends):
        section = markdown[heading.start() : section_end].strip() + "\n"
        title = heading.group("title").strip()
        slug = slugify(title)

        if not slug:
            raise InfospaceError(
                "invalid_entity_bundle",
                "Entity bundle contains an empty entity heading",
                {"title": title},
            )
        if slug in seen_slugs:
            raise InfospaceError(
                "duplicate_entity_bundle_item",
                f"Entity bundle contains duplicate entity: {title}",
                {"slug": slug, "title": title},
            )
        if not _DEFINITION_HEADING_RE.search(section):
            raise InfospaceError(
                "invalid_entity_bundle",
                f"Entity bundle item is missing a Definition section: {title}",
                {"slug": slug, "missing_sections": ["definition"]},
            )

        seen_slugs.add(slug)
        items.append(EntityBundleItem(title=title, slug=slug, markdown=section))
    return items
|
||||
|
||||
|
||||
def write_entity_bundle_artifacts(
    root: str | Path,
    markdown: str,
    *,
    workflow_id: str,
    stage_id: str,
    input_artifact_id: str,
    source_bundle_artifact_id: str = "",
    provider: str = "",
    dry_run: bool = False,
) -> list[EntityBundleItem]:
    """Parse an entity bundle, then write and register one artifact per entity.

    The bundle is parsed in full before anything is written, so a parse error
    leaves the file system untouched. Unless ``dry_run`` is set, each item's
    Markdown is written as UTF-8 to ``root / item.path`` (parent directories
    are created as needed) and passed to ``register_artifact`` with kind
    ``"entity"``.

    Args:
        root: Directory that item paths are resolved against.
        markdown: Full text of the entity bundle.
        workflow_id: Recorded in each artifact's provenance.
        stage_id: Recorded in each artifact's provenance.
        input_artifact_id: Recorded in provenance and used as the target of a
            ``generated_from`` relationship.
        source_bundle_artifact_id: When non-empty, recorded in provenance and
            used as the target of a ``split_from`` relationship.
        provider: When non-empty, recorded in provenance.
        dry_run: When true, only parse; write and register nothing.

    Returns:
        The parsed items, in document order.

    Raises:
        InfospaceError: Propagated from ``parse_entity_bundle`` when the
            bundle is invalid.
    """
    items = parse_entity_bundle(markdown)
    # Coerce root before the dry-run exit so it is validated in both modes.
    root_path = Path(root)
    if dry_run:
        return items

    # Provenance and relationships are the same for every item: build them
    # once and hand each register_artifact call its own copies.
    provenance: dict[str, Any] = {
        "workflow_id": workflow_id,
        "stage_id": stage_id,
        "input_artifact_id": input_artifact_id,
    }
    relationships = [{"type": "generated_from", "target": input_artifact_id}]
    if source_bundle_artifact_id:
        provenance["source_bundle_artifact_id"] = source_bundle_artifact_id
        relationships.append(
            {"type": "split_from", "target": source_bundle_artifact_id}
        )
    if provider:
        provenance["provider"] = provider

    for item in items:
        target = root_path / item.path
        target.parent.mkdir(parents=True, exist_ok=True)
        target.write_text(item.markdown, encoding="utf-8")
        register_artifact(
            root_path,
            artifact_id=item.artifact_id,
            path=item.path,
            kind="entity",
            title=item.title,
            provenance=dict(provenance),
            relationships=[dict(relationship) for relationship in relationships],
        )
    return items
|
||||
Reference in New Issue
Block a user