WP-0001-T002: registry data model, Alembic, initial migration with retention seed

Schema (src/artifactstore/db/schema.py):
- events table (ADR-0002 source of truth): sequence BIGSERIAL PK, created_at,
  event_type, subject_kind, subject_id, actor, payload (CBOR bytes),
  payload_digest. Indexes on (subject_kind, subject_id) and
  (event_type, sequence).
- artifact_packages, artifact_files, storage_locations, retention_state
  (materialised views over events).
- retention_classes (seed table) and metadata_schemas (config table).
- ADR-0001 columns present: digest_algorithm, digest_primary, digest_sha256,
  content_address. Blueprint tiering columns present: retrieval_tier
  (default 'hot'), restore_status.
- Types portable: SQLAlchemy 2.0 Core with JSON().with_variant(JSONB, 'postgresql'),
  Uuid, LargeBinary, DateTime(timezone=True), Boolean false() default.

Seed (src/artifactstore/db/seed.py): five v1 retention classes (transient,
raw-evidence, summary-evidence, release-evidence, permanent-record) with
default durations in seconds; permanent-record has no expiry.

Alembic:
- alembic.ini with sync sqlite URL default; path_separator=os to silence the
  1.13 deprecation warning.
- migrations/env.py: translates async URLs (+aiosqlite, +asyncpg) to sync
  counterparts at migrate-time so a single ARTIFACTSTORE_DATABASE_URL works
  for both runtime (async) and Alembic (sync).
- migrations/script.py.mako template.
- migrations/versions/20260516_0001_initial.py: metadata.create_all + bulk
  insert of retention class seeds.

Make:
- make migrate: alembic upgrade head (ensures var/ exists).
- make migrate-fresh: drop local SQLite + re-run.

Deps: psycopg[binary] added as optional `postgres` extra (PostgreSQL prod
path; SQLite default for dev needs no extra).

Tests:
- tests/unit/test_db_schema.py: every expected table present; ADR-0001 and
  tiering columns present; seed has the five v1 classes; permanent-record
  has no default_duration; create_all + FK insert + Boolean default
  round-trip on in-memory SQLite.
- tests/integration/test_migrations.py: alembic upgrade head against a
  tempfile SQLite produces all tables (+ alembic_version) and the seed rows.

Gates: ruff clean, mypy --strict clean on 32 files, 38 tests pass.

Co-Authored-By: Claude Opus 4.7 <noreply@anthropic.com>
This commit is contained in:
2026-05-16 01:50:38 +02:00
parent d14ee517d9
commit f8097cb683
14 changed files with 749 additions and 15 deletions

View File

@@ -0,0 +1,12 @@
"""Database schema and engine factory.
The ``schema`` submodule owns the SQLAlchemy Core :class:`MetaData` and
:class:`Table` definitions referenced by both migrations and runtime queries.
``engine`` exposes the async engine factory. ``seed`` holds bootstrap data
applied by the initial migration.
"""
from artifactstore.db import schema, seed
from artifactstore.db.engine import create_engine
# Public surface of the ``artifactstore.db`` package: the async engine factory
# plus the ``schema`` and ``seed`` submodules imported above.
__all__ = ["create_engine", "schema", "seed"]

View File

@@ -0,0 +1,12 @@
"""Async SQLAlchemy engine factory."""
from __future__ import annotations
from sqlalchemy.ext.asyncio import AsyncEngine, create_async_engine
from artifactstore.config import Settings
def create_engine(settings: Settings) -> AsyncEngine:
    """Construct the runtime async engine from settings.

    The engine is built from ``settings.database_url`` with SQL statement
    echoing disabled. A new engine is created on every call; nothing is
    cached here.

    :param settings: application settings providing ``database_url``.
    :returns: a freshly constructed :class:`AsyncEngine`.
    """
    # ``future=True`` is intentionally not passed: under SQLAlchemy 2.0 every
    # engine is 2.0-style and the flag is retained only for backwards
    # compatibility, so spelling it out adds nothing.
    return create_async_engine(settings.database_url, echo=False)

View File

@@ -0,0 +1,160 @@
"""Database schema (ADR-0002 + ARCHITECTURE-BLUEPRINT data model).
All tables are defined via SQLAlchemy Core so the same definitions drive
migrations (Alembic) and runtime queries (registry orchestrator). Types use
the portable SQLAlchemy 2.0 forms; PostgreSQL-specific variants are layered
via :func:`with_variant` where the gain (e.g. ``JSONB`` over ``JSON``) is
meaningful.
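The ``events`` table is the source of truth (ADR-0002). ``artifact_packages``,
``artifact_files``, ``storage_locations`` and ``retention_state`` are
materialised views rebuildable from the event log; ``retention_classes`` (seed
data) and ``metadata_schemas`` (configuration) are ordinary reference tables.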
"""
from __future__ import annotations
from sqlalchemy import (
    JSON,
    BigInteger,
    Boolean,
    Column,
    DateTime,
    ForeignKey,
    Index,
    Integer,
    LargeBinary,
    MetaData,
    String,
    Table,
    UniqueConstraint,
    func,
)
from sqlalchemy.dialects.postgresql import JSONB
from sqlalchemy.sql import false
from sqlalchemy.types import Uuid
# Deterministic names for constraints that are not named explicitly (foreign
# keys, primary keys, column-level ``unique=True``). Without a convention these
# are unnamed or backend-named, which makes them hard to target from later
# Alembic migrations -- in particular in SQLite batch mode. Explicitly named
# constraints and indexes keep the names they were given. No ``ck`` entry is
# defined, so check constraints are unaffected.
_NAMING_CONVENTION = {
    "ix": "ix_%(column_0_label)s",
    "uq": "uq_%(table_name)s_%(column_0_name)s",
    "fk": "fk_%(table_name)s_%(column_0_name)s_%(referred_table_name)s",
    "pk": "pk_%(table_name)s",
}

# Single MetaData instance shared by every table defined in this module.
metadata = MetaData(naming_convention=_NAMING_CONVENTION)

# Portable JSON column type: ``JSONB`` on PostgreSQL, generic ``JSON`` on any
# other backend.
_JSON_TYPE = JSON().with_variant(JSONB(), "postgresql")
# Event log: the source of truth (ADR-0002). Each row carries an encoded
# payload together with its digest.
events = Table(
    "events",
    metadata,
    # SQLite only auto-generates keys for a column declared exactly as
    # ``INTEGER PRIMARY KEY``; a plain ``BIGINT`` primary key is rendered
    # NOT NULL with no generator, so inserts that omit ``sequence`` fail.
    # The SQLite variant keeps BIGINT/BIGSERIAL on PostgreSQL while making
    # autoincrement work on the SQLite development database (SQLite INTEGER
    # is 64-bit, so the value range is unchanged).
    Column(
        "sequence",
        BigInteger().with_variant(Integer, "sqlite"),
        primary_key=True,
        autoincrement=True,
    ),
    Column("created_at", DateTime(timezone=True), nullable=False, server_default=func.now()),
    Column("event_type", String, nullable=False),
    Column("subject_kind", String, nullable=False),
    # Nullable: an event is not required to reference a subject id.
    Column("subject_id", Uuid, nullable=True),
    Column("actor", String, nullable=False),
    Column("payload", LargeBinary, nullable=False),
    Column("payload_digest", LargeBinary, nullable=False),
    # Lookup of all events for one subject.
    Index("ix_events_subject", "subject_kind", "subject_id"),
    # Ordered scan of one event type.
    Index("ix_events_type_sequence", "event_type", "sequence"),
)
# Retention class catalogue, keyed by a string class id.
retention_classes = Table(
    "retention_classes",
    metadata,
    Column("class_id", String, primary_key=True),
    # Nullable so that a class can be stored without a default duration.
    Column("default_duration_seconds", BigInteger, nullable=True),
    Column("deletion_strategy", String, nullable=False),
)
# Registered JSON Schemas, addressable by UUID or by a unique slug.
metadata_schemas = Table(
    "metadata_schemas",
    metadata,
    Column("id", Uuid, primary_key=True),
    Column("slug", String, nullable=False, unique=True),
    # Stored with the portable JSON type (JSONB on PostgreSQL).
    Column("json_schema", _JSON_TYPE, nullable=False),
    Column("created_at", DateTime(timezone=True), nullable=False, server_default=func.now()),
)
# One row per artifact package.
artifact_packages = Table(
    "artifact_packages",
    metadata,
    # --- identity -----------------------------------------------------------
    Column("id", Uuid, primary_key=True),
    Column("name", String, nullable=False),
    Column("producer", String, nullable=False),
    Column("subject", String, nullable=False),
    # --- classification -----------------------------------------------------
    # Every package must reference an existing retention class.
    Column("retention_class", String, ForeignKey("retention_classes.class_id"), nullable=False),
    # The link to a registered metadata schema is optional.
    Column("metadata_schema_id", Uuid, ForeignKey("metadata_schemas.id"), nullable=True),
    Column("metadata", _JSON_TYPE, nullable=False),
    # --- lifecycle ----------------------------------------------------------
    Column("status", String, nullable=False),
    Column("manifest_digest", LargeBinary, nullable=True),
    Column("created_at", DateTime(timezone=True), nullable=False, server_default=func.now()),
    Column("finalized_at", DateTime(timezone=True), nullable=True),
    Column("expires_at", DateTime(timezone=True), nullable=True),
    # --- event-log bookkeeping ----------------------------------------------
    Column("last_event_sequence", BigInteger, nullable=False),
)
# One row per file inside a package; a relative path is unique within its
# package.
artifact_files = Table(
    "artifact_files",
    metadata,
    Column("id", Uuid, primary_key=True),
    Column("package_id", Uuid, ForeignKey("artifact_packages.id"), nullable=False),
    Column("relative_path", String, nullable=False),
    Column("media_type", String, nullable=False),
    Column("size_bytes", BigInteger, nullable=False),
    # ADR-0001 digest columns: the algorithm name, the primary digest, and a
    # SHA-256 digest stored alongside it.
    Column("digest_algorithm", String, nullable=False),
    Column("digest_primary", LargeBinary, nullable=False),
    Column("digest_sha256", LargeBinary, nullable=False),
    Column("created_at", DateTime(timezone=True), nullable=False, server_default=func.now()),
    UniqueConstraint("package_id", "relative_path", name="uq_artifact_files_pkg_path"),
)
# Storage placement records for artifact files, indexed by content address.
storage_locations = Table(
    "storage_locations",
    metadata,
    Column("id", Uuid, primary_key=True),
    Column("artifact_file_id", Uuid, ForeignKey("artifact_files.id"), nullable=False),
    # --- where the object lives --------------------------------------------
    Column("backend_id", String, nullable=False),
    Column("content_address", String, nullable=False),
    Column("object_key", String, nullable=False),
    Column("storage_class", String, nullable=True),
    # --- tiering ------------------------------------------------------------
    # Rows inserted without an explicit tier get 'hot' from the database.
    Column("retrieval_tier", String, nullable=False, server_default="hot"),
    Column("restore_status", String, nullable=True),
    # --- lifecycle ----------------------------------------------------------
    Column("status", String, nullable=False),
    Column("created_at", DateTime(timezone=True), nullable=False, server_default=func.now()),
    Column("last_verified_at", DateTime(timezone=True), nullable=True),
    Index("ix_storage_locations_content_address", "content_address"),
)
# Retention state, at most one row per package: the package id is both the
# primary key and a foreign key to ``artifact_packages``.
retention_state = Table(
    "retention_state",
    metadata,
    Column("package_id", Uuid, ForeignKey("artifact_packages.id"), primary_key=True),
    Column("current_expires_at", DateTime(timezone=True), nullable=True),
    Column("effective_class", String, nullable=False),
    Column("active_hold_id", Uuid, nullable=True),
    # Defaults to FALSE at the database level when not supplied on insert.
    Column("eligible_for_deletion", Boolean, nullable=False, server_default=false()),
)

View File

@@ -0,0 +1,52 @@
"""Bootstrap seed data applied by the initial migration.
The :data:`RETENTION_CLASS_SEEDS` entries match the five v1 retention classes
listed in ``docs/ARCHITECTURE-BLUEPRINT.md``. Default durations are intended
to be overridable by an operator configuration file (WP-0003); the seed
values only ensure the registry has sensible defaults on a fresh DB.
"""
from __future__ import annotations
from typing import TypedDict
class RetentionClassSeed(TypedDict):
class_id: str
default_duration_seconds: int | None
deletion_strategy: str
# Durations in seconds, each derived from the previous unit.
_ONE_DAY = 24 * 60 * 60
_NINETY_DAYS = _ONE_DAY * 90
_ONE_YEAR = _ONE_DAY * 365
_SEVEN_YEARS = _ONE_YEAR * 7

# The five v1 retention classes, ordered from shortest to longest retention.
# ``permanent-record`` carries no default duration.
RETENTION_CLASS_SEEDS: tuple[RetentionClassSeed, ...] = (
    RetentionClassSeed(
        class_id="transient",
        default_duration_seconds=_ONE_DAY,
        deletion_strategy="mark_eligible",
    ),
    RetentionClassSeed(
        class_id="raw-evidence",
        default_duration_seconds=_NINETY_DAYS,
        deletion_strategy="mark_eligible",
    ),
    RetentionClassSeed(
        class_id="summary-evidence",
        default_duration_seconds=_ONE_YEAR,
        deletion_strategy="mark_eligible",
    ),
    RetentionClassSeed(
        class_id="release-evidence",
        default_duration_seconds=_SEVEN_YEARS,
        deletion_strategy="mark_eligible",
    ),
    RetentionClassSeed(
        class_id="permanent-record",
        default_duration_seconds=None,
        deletion_strategy="mark_eligible",
    ),
)