Initial schemas and validation with extension workplan

This commit is contained in:
2026-05-03 22:12:46 +02:00
parent b96b1fb745
commit 8c9129c371
15 changed files with 1025 additions and 2 deletions

View File

@@ -9,6 +9,14 @@ from markitect_tool.core import (
parse_markdown,
parse_markdown_file,
)
from markitect_tool.schema import (
MarkdownSchema,
SchemaValidationResult,
ValidationViolation,
load_schema_file,
validate_document,
validate_markdown_file,
)
__all__ = [
"ContentBlock",
@@ -18,4 +26,10 @@ __all__ = [
"Section",
"parse_markdown",
"parse_markdown_file",
"MarkdownSchema",
"SchemaValidationResult",
"ValidationViolation",
"load_schema_file",
"validate_document",
"validate_markdown_file",
]

View File

@@ -9,6 +9,7 @@ import click
import yaml
from markitect_tool.core import parse_markdown_file
from markitect_tool.schema import load_schema_file, validate_markdown_file, validate_schema
@click.group()
@@ -40,5 +41,66 @@ def parse(file: Path, output_format: str) -> None:
click.echo(json.dumps(data, indent=2, ensure_ascii=False))
@main.command()
@click.argument("file", type=click.Path(exists=True, dir_okay=False, path_type=Path))
@click.option(
    "--schema",
    "schema_file",
    required=True,
    type=click.Path(exists=True, dir_okay=False, path_type=Path),
)
@click.option(
    "--format",
    "output_format",
    type=click.Choice(["json", "yaml", "text"], case_sensitive=False),
    default="text",
    show_default=True,
)
def validate(file: Path, schema_file: Path, output_format: str) -> None:
    """Validate a Markdown file against a Markdown schema file."""
    # The docstring above doubles as the command's help text, so the details
    # live here: run the validation, print the outcome in the requested
    # format, then exit with 0 when the document is valid and 1 otherwise.
    outcome = validate_markdown_file(file, schema_file)
    _emit_result(outcome.to_dict(), output_format)
    exit_code = 0 if outcome.valid else 1
    raise click.exceptions.Exit(exit_code)
@main.group()
def schema() -> None:
    """Work with Markdown schema files."""
    # Container group only: it has no behaviour of its own; subcommands are
    # attached to it with @schema.command(...).
@schema.command("validate")
@click.argument("schema_file", type=click.Path(exists=True, dir_okay=False, path_type=Path))
@click.option(
    "--format",
    "output_format",
    type=click.Choice(["json", "yaml", "text"], case_sensitive=False),
    default="text",
    show_default=True,
)
def schema_validate(schema_file: Path, output_format: str) -> None:
    """Validate that a Markdown schema contains a well-formed JSON Schema."""
    # The docstring above doubles as the command's help text, so the details
    # live here. Exit status is 0 for a well-formed schema and 1 otherwise.
    #
    # Imported locally so this command stays self-contained; SchemaLoaderError
    # is the common base of every failure raised while loading a schema file.
    from markitect_tool.schema import SchemaLoaderError

    try:
        loaded = load_schema_file(schema_file)
    except SchemaLoaderError as exc:
        # A file with a missing or malformed JSON block (or bad frontmatter)
        # is an invalid schema, not a crash: report it in the requested output
        # format, using the same keys as a regular violation, and exit 1.
        failure = {
            "valid": False,
            "violations": [{"path": "$", "message": str(exc), "schema_path": "$"}],
            "schema_path": str(schema_file),
        }
        _emit_result(failure, output_format)
        raise click.exceptions.Exit(1) from exc

    result = validate_schema(loaded.schema)
    # Always report the path given on the command line.
    data = result.to_dict() | {"schema_path": str(schema_file)}
    _emit_result(data, output_format)
    raise click.exceptions.Exit(0 if result.valid else 1)
def _emit_result(data: dict, output_format: str) -> None:
    """Print a validation result mapping in the requested output format.

    Args:
        data: Result mapping; the text format reads its ``valid`` flag and the
            ``path``/``message`` of each entry under ``violations``.
        output_format: ``"json"`` or ``"yaml"`` for a structured dump; any
            other value selects the plain-text summary.
    """
    if output_format == "json":
        click.echo(json.dumps(data, indent=2, ensure_ascii=False))
        return

    if output_format == "yaml":
        click.echo(yaml.safe_dump(data, sort_keys=False))
        return

    # Plain text: a one-word verdict, followed by one bullet per violation
    # when the result is invalid.
    if data.get("valid"):
        click.echo("valid")
        return

    click.echo("invalid")
    for entry in data.get("violations", []):
        click.echo(f"- {entry['path']}: {entry['message']}")
# Run the CLI entry point when this module is executed directly as a script.
if __name__ == "__main__":
    main()

View File

@@ -0,0 +1,31 @@
"""Schema loading and validation for structured Markdown documents."""
from markitect_tool.schema.loader import (
InvalidSchemaFormatError,
MarkdownSchema,
SchemaLoaderError,
SchemaNotFoundError,
load_schema_file,
load_schema_text,
)
from markitect_tool.schema.validator import (
SchemaValidationResult,
ValidationViolation,
validate_document,
validate_markdown_file,
validate_schema,
)
__all__ = [
"InvalidSchemaFormatError",
"MarkdownSchema",
"SchemaLoaderError",
"SchemaNotFoundError",
"SchemaValidationResult",
"ValidationViolation",
"load_schema_file",
"load_schema_text",
"validate_document",
"validate_markdown_file",
"validate_schema",
]

View File

@@ -0,0 +1,124 @@
"""Load JSON Schema definitions embedded in Markdown schema files."""
from __future__ import annotations
import json
import re
from dataclasses import dataclass
from pathlib import Path
from typing import Any
import yaml
class SchemaLoaderError(ValueError):
    """Base error raised for schema loading failures.

    Derives from ``ValueError``, so callers that catch ``ValueError`` also
    catch every loader-specific error.
    """
class SchemaNotFoundError(SchemaLoaderError):
    """Raised when no JSON schema block can be found.

    This means the Markdown text contains no fenced ``json`` code block at all.
    """
class InvalidSchemaFormatError(SchemaLoaderError):
    """Raised when a schema block exists but is not valid JSON object data.

    Also raised when the YAML frontmatter cannot be parsed or is not a mapping.
    """
@dataclass(frozen=True)
class MarkdownSchema:
    """Immutable record of a JSON Schema taken from a Markdown schema document."""

    # The JSON Schema object itself.
    schema: dict[str, Any]
    # Mapping that accompanies the schema (the loader stores frontmatter here).
    metadata: dict[str, Any]
    # Markdown text associated with the schema.
    documentation: str
    # Where the schema was read from, when known.
    source_path: str | None = None

    def to_dict(self) -> dict[str, Any]:
        """Return the fields as a plain dict, leaving out any that are ``None``."""
        ordered_fields = (
            ("schema", self.schema),
            ("metadata", self.metadata),
            ("documentation", self.documentation),
            ("source_path", self.source_path),
        )
        return {name: value for name, value in ordered_fields if value is not None}
# Matches a fenced code block opened with ```json (case-insensitive) and
# captures its body; DOTALL lets the non-greedy body span multiple lines.
_JSON_BLOCK_RE = re.compile(r"```json\s*(.*?)```", re.DOTALL | re.IGNORECASE)
def load_schema_file(path: str | Path) -> MarkdownSchema:
    """Read a Markdown schema file as UTF-8 and load the schema it contains.

    Args:
        path: Location of the Markdown schema file.

    Returns:
        The loaded schema, with ``source_path`` set to the given path.

    Raises:
        FileNotFoundError: If nothing exists at ``path``.
    """
    target = Path(path)
    if target.exists():
        contents = target.read_text(encoding="utf-8")
        return load_schema_text(contents, source_path=str(target))
    raise FileNotFoundError(f"Schema file not found: {target}")
def load_schema_text(text: str, source_path: str | None = None) -> MarkdownSchema:
    """Build a ``MarkdownSchema`` from the text of a Markdown schema document.

    Args:
        text: Full Markdown source, optionally starting with YAML frontmatter.
        source_path: Origin of the text, recorded on the result when given.

    Returns:
        The extracted schema together with its frontmatter metadata and the
        Markdown text that follows the frontmatter.
    """
    metadata, documentation = _split_frontmatter(text)

    # Work on a shallow copy so the provenance key is added to our own dict.
    schema = {**_extract_json_schema(documentation)}

    # Record where the schema came from, unless the schema already carries
    # its own "x-markitect-source" entry.
    if "x-markitect-source" not in schema:
        schema["x-markitect-source"] = {
            "format": "markdown",
            "file": source_path,
            "frontmatter": metadata,
        }

    return MarkdownSchema(
        schema=schema,
        metadata=metadata,
        documentation=documentation,
        source_path=source_path,
    )
def _split_frontmatter(text: str) -> tuple[dict[str, Any], str]:
if not text.startswith("---\n"):
return {}, text
end = text.find("\n---", 4)
if end == -1:
return {}, text
closing_end = text.find("\n", end + 4)
if closing_end == -1:
closing_end = len(text)
else:
closing_end += 1
raw = text[4:end]
try:
metadata = yaml.safe_load(raw) if raw.strip() else {}
except yaml.YAMLError as exc:
raise InvalidSchemaFormatError(f"Invalid schema frontmatter: {exc}") from exc
if metadata is None:
metadata = {}
if not isinstance(metadata, dict):
raise InvalidSchemaFormatError("Schema frontmatter must be a mapping")
return metadata, text[closing_end:]
def _extract_json_schema(text: str) -> dict[str, Any]:
    """Pick the JSON Schema object out of the fenced JSON blocks in ``text``.

    Every fenced JSON block must decode to a JSON object. The first object
    containing a ``$schema`` or ``type`` key is returned; if none has one,
    the first block is used.

    Raises:
        SchemaNotFoundError: If the text has no fenced JSON block.
        InvalidSchemaFormatError: If any block is not valid JSON or does not
            decode to an object.
    """
    objects: list[dict[str, Any]] = []
    for fence in _JSON_BLOCK_RE.finditer(text):
        body = fence.group(1).strip()
        try:
            decoded = json.loads(body)
        except json.JSONDecodeError as exc:
            raise InvalidSchemaFormatError(f"Invalid JSON schema block: {exc}") from exc
        if not isinstance(decoded, dict):
            raise InvalidSchemaFormatError("JSON schema block must contain an object")
        objects.append(decoded)

    if not objects:
        raise SchemaNotFoundError("No JSON schema found in markdown schema")

    # Prefer a block that looks like a schema; otherwise use the first one.
    return next(
        (candidate for candidate in objects if "$schema" in candidate or "type" in candidate),
        objects[0],
    )

View File

@@ -0,0 +1,110 @@
"""Validate parsed Markdown documents against JSON Schema."""
from __future__ import annotations
from dataclasses import asdict, dataclass
from pathlib import Path
from typing import Any
from jsonschema import Draft202012Validator, SchemaError, ValidationError
from markitect_tool.core import Document, parse_markdown_file
from markitect_tool.schema.loader import MarkdownSchema, load_schema_file
@dataclass(frozen=True)
class ValidationViolation:
    """One problem reported by schema validation."""

    # Location the violation refers to, as a "$"-rooted dotted path.
    path: str
    # Human-readable description of the problem.
    message: str
    # Location inside the schema of the rule that failed, same path format.
    schema_path: str

    def to_dict(self) -> dict[str, str]:
        """Return the violation as a plain ``dict`` keyed by field name."""
        return {
            "path": self.path,
            "message": self.message,
            "schema_path": self.schema_path,
        }
@dataclass(frozen=True)
class SchemaValidationResult:
    """Outcome of validating one document against one schema."""

    # True when validation passed.
    valid: bool
    # The violations that were found.
    violations: list[ValidationViolation]
    # Path of the validated document, when known.
    document_path: str | None = None
    # Path of the schema used, when known.
    schema_path: str | None = None

    def to_dict(self) -> dict[str, Any]:
        """Return a plain dict of the result, leaving out entries that are ``None``."""
        ordered_fields = (
            ("valid", self.valid),
            ("violations", [item.to_dict() for item in self.violations]),
            ("document_path", self.document_path),
            ("schema_path", self.schema_path),
        )
        return {name: value for name, value in ordered_fields if value is not None}
def validate_schema(schema: dict[str, Any]) -> SchemaValidationResult:
    """Check a JSON Schema for well-formedness with ``Draft202012Validator``.

    Args:
        schema: The raw JSON Schema object to check.

    Returns:
        A valid result with no violations when the check passes; otherwise an
        invalid result with one violation built from the raised ``SchemaError``.
    """
    try:
        Draft202012Validator.check_schema(schema)
    except SchemaError as exc:
        problem = ValidationViolation(
            path=_format_path(exc.path),
            message=exc.message,
            schema_path=_format_path(exc.schema_path),
        )
        return SchemaValidationResult(valid=False, violations=[problem])
    else:
        return SchemaValidationResult(valid=True, violations=[])
def validate_markdown_file(
    markdown_path: str | Path, schema_path: str | Path
) -> SchemaValidationResult:
    """Validate the Markdown file at ``markdown_path`` against a schema file.

    Args:
        markdown_path: Markdown document to parse and validate.
        schema_path: Markdown schema file to load the JSON Schema from.

    Returns:
        The result of validating the parsed document against the loaded schema.
    """
    # Arguments are evaluated left to right: the document is parsed before
    # the schema file is loaded.
    return validate_document(
        parse_markdown_file(markdown_path),
        load_schema_file(schema_path),
    )
def validate_document(
    document: Document, schema: MarkdownSchema | dict[str, Any]
) -> SchemaValidationResult:
    """Validate a parsed document against a loaded or raw JSON Schema.

    Args:
        document: Parsed document; its ``to_dict()`` output is what is checked.
        schema: A ``MarkdownSchema`` (its ``source_path`` is reported in the
            result) or a bare JSON Schema dict (no schema path is reported).

    Returns:
        A result listing every violation, ordered by the string form of the
        underlying error. If the schema itself is malformed, the result is
        invalid and carries the schema check's violations instead.
    """
    if isinstance(schema, MarkdownSchema):
        raw_schema = schema.schema
        schema_path = schema.source_path
    else:
        raw_schema = schema
        schema_path = None

    # Reject a malformed schema before validating any document data with it.
    precheck = validate_schema(raw_schema)
    if not precheck.valid:
        return SchemaValidationResult(
            valid=False,
            violations=precheck.violations,
            document_path=document.source_path,
            schema_path=schema_path,
        )

    errors = list(Draft202012Validator(raw_schema).iter_errors(document.to_dict()))
    errors.sort(key=str)  # stable, deterministic ordering of the report

    violations = []
    for error in errors:
        violations.append(
            ValidationViolation(
                path=_format_path(error.path),
                message=error.message,
                schema_path=_format_path(error.schema_path),
            )
        )

    return SchemaValidationResult(
        valid=len(violations) == 0,
        violations=violations,
        document_path=document.source_path,
        schema_path=schema_path,
    )
def _format_path(path: Any) -> str:
parts = [str(part) for part in path]
return "$" if not parts else "$." + ".".join(parts)