Implement HTTP ingestion and retention lifecycle

This commit is contained in:
2026-05-16 23:10:21 +02:00
parent 2173f702c1
commit c33baa3635
15 changed files with 2478 additions and 69 deletions

View File

@@ -2,8 +2,10 @@
from __future__ import annotations
import asyncio
import json
from pathlib import Path
from typing import Any
import pytest
from sqlalchemy import create_engine, insert, inspect
@@ -84,3 +86,169 @@ def test_cli_health_reports_ok(
assert payload["status"] == "ok"
assert payload["db"]["healthy"] is True
assert payload["backend"]["healthy"] is True
def test_cli_push_uses_http_api(
    runner: CliRunner,
    tmp_path: Path,
    monkeypatch: pytest.MonkeyPatch,
) -> None:
    """Run ``push`` against stubbed HTTP helpers and check the printed summary.

    ``artifactstore.cli._http_json`` and ``artifactstore.cli._http_multipart``
    are replaced with recording stubs, so the test pins the requests the CLI
    issues for a one-file source directory as well as the JSON it prints.
    """
    src_dir = tmp_path / "source"
    src_dir.mkdir()
    (src_dir / "a.txt").write_text("alpha", encoding="utf-8")

    json_requests: list[tuple[str, str, dict[str, Any]]] = []
    uploads: list[tuple[str, dict[str, str], bytes]] = []

    def stub_http_json(
        method: str,
        base_url: str,
        path: str,
        token: str,
        payload: dict[str, Any],
    ) -> dict[str, Any]:
        # Record before asserting so the request is captured either way.
        json_requests.append((method, path, payload))
        assert base_url == "http://api.test"
        assert token == "secret"
        # Built per call so every caller receives a fresh response dict.
        canned: dict[str, dict[str, Any]] = {
            "/packages": {"id": "pkg-1"},
            "/packages/pkg-1/finalize": {"manifest_digest": "blake3:abc"},
        }
        if path not in canned:
            raise AssertionError(f"unexpected JSON request: {method} {path}")
        return canned[path]

    def stub_http_multipart(
        base_url: str,
        path: str,
        token: str,
        *,
        fields: dict[str, str],
        file_field: str,
        file_name: str,
        file_content_type: str,
        file_bytes: bytes,
    ) -> dict[str, Any]:
        assert base_url == "http://api.test"
        assert token == "secret"
        assert path == "/packages/pkg-1/files"
        assert file_field == "file"
        assert file_name == "a.txt"
        assert file_content_type == "text/plain"
        uploads.append((path, fields, file_bytes))
        return {"id": "file-1"}

    monkeypatch.setattr("artifactstore.cli._http_json", stub_http_json)
    monkeypatch.setattr("artifactstore.cli._http_multipart", stub_http_multipart)

    argv = [
        "push",
        str(src_dir),
        "--producer",
        "prod",
        "--subject",
        "sub",
        "--api-url",
        "http://api.test",
        "--token",
        "secret",
    ]
    result = runner.invoke(cli_app, argv)

    assert result.exit_code == 0, result.output
    expected_summary = {
        "package_id": "pkg-1",
        "manifest_digest": "blake3:abc",
        "files": 1,
    }
    assert json.loads(result.output) == expected_summary

    # JSON traffic: package creation first, finalize second.
    create_request = json_requests[0]
    assert create_request[1] == "/packages"
    finalize_request = json_requests[1]
    assert finalize_request[1] == "/packages/pkg-1/finalize"

    expected_upload = (
        "/packages/pkg-1/files",
        {"relative_path": "a.txt", "media_type": "text/plain"},
        b"alpha",
    )
    assert uploads == [expected_upload]
def test_cli_manifest_fetches_json_projection(
    runner: CliRunner,
    monkeypatch: pytest.MonkeyPatch,
) -> None:
    """Check that ``manifest`` requests ``manifest.json`` and prints its JSON body.

    ``artifactstore.cli._http_bytes`` is replaced with a stub that validates
    every argument of the single expected GET and returns a fixed document.
    """

    def stub_http_bytes(
        method: str,
        base_url: str,
        path: str,
        token: str,
        *,
        body: bytes | None = None,
        headers: dict[str, str] | None = None,
    ) -> bytes:
        assert method == "GET"
        assert base_url == "http://api.test"
        assert path == "/packages/pkg-1/manifest.json"
        assert token == "secret"
        assert body is None
        assert headers == {"Accept": "application/json"}
        return b'{"manifest_version":1}'

    monkeypatch.setattr("artifactstore.cli._http_bytes", stub_http_bytes)

    argv = [
        "manifest",
        "pkg-1",
        "--api-url",
        "http://api.test",
        "--token",
        "secret",
    ]
    result = runner.invoke(cli_app, argv)

    assert result.exit_code == 0, result.output
    printed = json.loads(result.output)
    assert printed == {"manifest_version": 1}
def test_cli_retention_sweep_marks_expired_package(
    runner: CliRunner,
    env_db: Path,
    tmp_path: Path,
    monkeypatch: pytest.MonkeyPatch,
) -> None:
    """Check that ``retention sweep`` reports a package created with zero retention.

    The database at ``env_db`` is given its schema and retention-class seeds,
    a retention config sets the ``transient`` default duration to 0 seconds,
    and one ``transient`` package is created through the registry before the
    CLI sweep runs.
    """
    seed_rows = [dict(seed) for seed in RETENTION_CLASS_SEEDS]
    bootstrap_engine = create_engine(f"sqlite:///{env_db}", future=True)
    metadata.create_all(bootstrap_engine)
    with bootstrap_engine.begin() as txn:
        txn.execute(insert(retention_classes), seed_rows)
    bootstrap_engine.dispose()

    config_file = tmp_path / "retention.toml"
    config_file.write_text(
        '[retention_classes.transient]\ndefault_duration_seconds = 0\n',
        encoding="utf-8",
    )
    monkeypatch.setenv("ARTIFACTSTORE_RETENTION_CONFIG_PATH", str(config_file))

    async def create_expired_package() -> str:
        # Imported here, after the environment variable above has been set.
        from artifactstore.app import build_registry
        from artifactstore.config import get_settings

        registry = build_registry(get_settings())
        try:
            created_id = await registry.create_package(
                name="expired",
                producer="tests",
                subject="cli-sweep",
                retention_class="transient",
                actor="ops",
            )
        finally:
            await registry.dispose()
        return str(created_id)

    package_id = asyncio.run(create_expired_package())

    result = runner.invoke(cli_app, ["retention", "sweep"])
    assert result.exit_code == 0, result.output

    report = json.loads(result.output)
    assert report == {"marked_package_ids": [package_id], "marked_count": 1}

View File

@@ -0,0 +1,240 @@
"""HTTP API integration tests for ARTIFACT-STORE-WP-0002."""
from __future__ import annotations
import tempfile
from pathlib import Path
from typing import Any
import cbor2
from fastapi.testclient import TestClient
from hypothesis import HealthCheck, given
from hypothesis import settings as hypothesis_settings
from hypothesis import strategies as st
from sqlalchemy import create_engine, insert
from artifactstore.api.http import create_app
from artifactstore.config import Settings
from artifactstore.db.schema import metadata, retention_classes
from artifactstore.db.seed import RETENTION_CLASS_SEEDS
from artifactstore.identity import digest_bytes
AUTH = {"Authorization": "Bearer test-token"}
def _settings(root: Path) -> Settings:
    """Prepare a seeded SQLite database and storage directory under ``root``.

    The schema is created and the retention-class seeds are inserted with a
    throwaway synchronous engine; the returned ``Settings`` point at the same
    database file through the ``sqlite+aiosqlite`` URL.
    """
    blob_dir = root / "storage"
    blob_dir.mkdir(parents=True, exist_ok=True)

    db_file = root / "http-api.db"
    seed_rows = [dict(seed) for seed in RETENTION_CLASS_SEEDS]
    bootstrap_engine = create_engine(f"sqlite:///{db_file}", future=True)
    metadata.create_all(bootstrap_engine)
    with bootstrap_engine.begin() as txn:
        txn.execute(insert(retention_classes), seed_rows)
    bootstrap_engine.dispose()

    return Settings(
        database_url=f"sqlite+aiosqlite:///{db_file}",
        storage_local_root=str(blob_dir),
        log_level="INFO",
        auth_tokens="test-token",
    )
def _create_package(client: TestClient, *, name: str = "pkg") -> str:
    """POST a ``raw-evidence`` package called ``name`` and return its id as a string.

    Asserts that the API answers with 201.
    """
    request_body = {
        "name": name,
        "producer": "guide-board",
        "subject": "run-42",
        "retention_class": "raw-evidence",
        "metadata": {"run_id": "r-42", "kind": "integration"},
    }
    response = client.post("/packages", headers=AUTH, json=request_body)
    assert response.status_code == 201, response.text
    created = response.json()
    return str(created["id"])
def _upload_file(client: TestClient, package_id: str, rel_path: str, data: bytes) -> dict[str, Any]:
    """Upload ``data`` to the package as ``rel_path`` and return the file record.

    The multipart part is named after the final component of ``rel_path`` and
    sent as ``application/octet-stream``. Asserts that the API answers with 201.
    """
    octet_stream = "application/octet-stream"
    form_fields = {"relative_path": rel_path, "media_type": octet_stream}
    file_part = (Path(rel_path).name, data, octet_stream)
    response = client.post(
        f"/packages/{package_id}/files",
        headers=AUTH,
        data=form_fields,
        files={"file": file_part},
    )
    assert response.status_code == 201, response.text
    return dict(response.json())
def test_http_surface_ingest_finalize_download_and_events(tmp_path: Path) -> None:
    """Walk one package through the HTTP surface end to end.

    Covers the unauthenticated rejection, the docs and discovery endpoints,
    filtered package listing, a single file upload, finalize, both manifest
    encodings, full / ranged / conditional downloads, and the event feed in
    JSON and CBOR.
    """
    app = create_app(_settings(tmp_path))
    with TestClient(app) as client:
        # A request without credentials is rejected with a problem+json body.
        rejected = client.get("/packages")
        assert rejected.status_code == 401
        assert rejected.headers["content-type"].startswith("application/problem+json")

        # The schema and docs pages are requested without the auth header.
        for public_path in ("/openapi.json", "/docs"):
            assert client.get(public_path).status_code == 200

        backends = client.get("/backends", headers=AUTH).json()["backends"]
        assert backends[0]["backend_id"] == "local"
        assert client.get("/retention-classes", headers=AUTH).json()["retention_classes"]

        package_id = _create_package(client)

        # Every filter below matches the package that was just created.
        filters = {
            "producer": "guide-board",
            "subject": "run-42",
            "retention_class": "raw-evidence",
            "metadata_key": "run_id",
            "metadata_value": "r-42",
        }
        listing = client.get("/packages", headers=AUTH, params=filters)
        assert listing.status_code == 200
        listed_ids = [pkg["id"] for pkg in listing.json()["packages"]]
        assert listed_ids == [package_id]

        data = b"hello artifact-store http api" * 64
        file_record = _upload_file(client, package_id, "reports/hello.bin", data)
        assert file_record["size_bytes"] == len(data)
        assert file_record["digest_primary_hex"] == digest_bytes(data).primary.hex

        finalized = client.post(f"/packages/{package_id}/finalize", headers=AUTH)
        assert finalized.status_code == 200, finalized.text
        finalized_body = finalized.json()
        assert finalized_body["status"] == "finalized"
        assert finalized_body["manifest_digest"].startswith("blake3:")

        # Manifest, CBOR encoding.
        manifest_cbor = client.get(
            f"/packages/{package_id}/manifest",
            headers={**AUTH, "Accept": "application/cbor"},
        )
        assert manifest_cbor.status_code == 200
        decoded_manifest = cbor2.loads(manifest_cbor.content)
        assert decoded_manifest["manifest_version"] == 1
        assert decoded_manifest["package"]["id"] == package_id

        # Manifest, JSON projection.
        manifest_json = client.get(f"/packages/{package_id}/manifest.json", headers=AUTH)
        assert manifest_json.status_code == 200
        first_manifest_file = manifest_json.json()["files"][0]
        assert first_manifest_file["relative_path"] == "reports/hello.bin"

        file_id = file_record["id"]
        download_url = f"/files/{file_id}/download"

        file_meta = client.get(f"/files/{file_id}", headers=AUTH)
        assert file_meta.status_code == 200
        content_address = file_meta.json()["content_address"]
        quoted_etag = f'"{content_address}"'

        # Full download carries the content address as its ETag.
        download = client.get(download_url, headers=AUTH)
        assert download.status_code == 200
        assert download.content == data
        assert download.headers["etag"] == quoted_etag

        # Range bounds are inclusive: bytes 6..17 is the slice [6:18].
        partial = client.get(download_url, headers={**AUTH, "Range": "bytes=6-17"})
        assert partial.status_code == 206
        assert partial.headers["content-range"] == f"bytes 6-17/{len(data)}"
        assert partial.content == data[6:18]

        # Conditional request with the matching ETag.
        not_modified = client.get(
            download_url,
            headers={**AUTH, "If-None-Match": quoted_etag},
        )
        assert not_modified.status_code == 304

        events_json = client.get(
            "/events",
            headers={**AUTH, "Accept": "application/json"},
            params={"since": 0, "limit": 10, "wait_seconds": 0},
        )
        assert events_json.status_code == 200
        observed_types = [evt["event_type"] for evt in events_json.json()["events"]]
        assert observed_types == [
            "v1.package.created",
            "v1.retention.default_applied",
            "v1.file.ingested",
            "v1.package.finalized",
        ]

        events_cbor = client.get(
            "/events",
            headers={**AUTH, "Accept": "application/cbor"},
            params={"since": 0, "limit": 10, "wait_seconds": 0},
        )
        assert events_cbor.status_code == 200
        decoded_events = cbor2.loads(events_cbor.content)
        assert decoded_events["events"][0]["sequence"] == 1
def test_http_scripted_50_file_package_flow(tmp_path: Path) -> None:
    """Upload 50 distinct files, finalize, then verify every download and the event count."""
    app = create_app(_settings(tmp_path))
    with TestClient(app) as client:
        package_id = _create_package(client, name="fifty")

        uploaded: list[tuple[str, bytes, dict[str, Any]]] = []
        for idx in range(50):
            rel_path = f"bundle/file-{idx:02d}.bin"
            # Each payload has its own prefix and length, so no two files share content.
            prefix = f"payload {idx:02d}:".encode()
            payload = prefix + bytes([idx]) * (idx + 1)
            record = _upload_file(client, package_id, rel_path, payload)
            uploaded.append((rel_path, payload, record))

        finalized = client.post(f"/packages/{package_id}/finalize", headers=AUTH)
        assert finalized.status_code == 200, finalized.text

        for rel_path, payload, record in uploaded:
            assert record["relative_path"] == rel_path
            assert record["digest_primary_hex"] == digest_bytes(payload).primary.hex
            file_id = record["id"]
            downloaded = client.get(f"/files/{file_id}/download", headers=AUTH)
            assert downloaded.status_code == 200
            assert downloaded.content == payload

        events = client.get(
            "/events",
            headers={**AUTH, "Accept": "application/json"},
            params={"since": 0, "limit": 100, "wait_seconds": 0},
        )
        assert events.status_code == 200
        event_list = events.json()["events"]
        # NOTE(review): 53 presumably = 1 created + 1 default retention
        # + 50 ingested + 1 finalized; confirm against the event model.
        assert len(event_list) == 53
        assert event_list[-1]["event_type"] == "v1.package.finalized"
@given(
    data=st.binary(min_size=1, max_size=512),
    stem=st.text(alphabet=list("abcdefghijklmnopqrstuvwxyz0123456789_-"), min_size=1, max_size=24),
)
@hypothesis_settings(
    max_examples=12,
    deadline=None,
    suppress_health_check=[HealthCheck.function_scoped_fixture],
)
def test_upload_session_lifecycle_property(data: bytes, stem: str) -> None:
    """Open, fill and complete an upload session; the download must equal ``data``.

    Each generated example runs against its own temporary directory, so the
    database and storage never carry over between examples.
    """
    octet_stream = "application/octet-stream"
    total = len(data)

    with tempfile.TemporaryDirectory() as tmp:
        app = create_app(_settings(Path(tmp)))
        with TestClient(app) as client:
            package_id = _create_package(client, name="upload-session")

            opened = client.post(
                "/uploads",
                headers=AUTH,
                json={"expected_size_bytes": total, "media_type": octet_stream},
            )
            assert opened.status_code == 201, opened.text
            upload_url = opened.json()["content_upload_url"]

            # A single PATCH carries the whole payload; the range end is inclusive.
            content_range = f"bytes 0-{total - 1}/{total}"
            patched = client.patch(
                upload_url,
                headers={**AUTH, "Content-Range": content_range},
                content=data,
            )
            assert patched.status_code == 200, patched.text
            assert patched.json()["received_bytes"] == total

            completion_body = {
                "package_id": package_id,
                "relative_path": f"uploads/{stem}.bin",
                "media_type": octet_stream,
            }
            completed = client.post(
                f"{upload_url}/complete",
                headers=AUTH,
                json=completion_body,
            )
            assert completed.status_code == 201, completed.text

            file_id = completed.json()["id"]
            downloaded = client.get(f"/files/{file_id}/download", headers=AUTH)
            assert downloaded.status_code == 200
            assert downloaded.content == data

View File

@@ -243,18 +243,19 @@ async def test_end_to_end_ingest_finalize_replay(
stream = await registry.get_file(fid)
assert await _consume(stream) == expected
# Tail events: 1 created + 3 ingested + 1 finalized = 5.
# Tail events: 1 created + 1 default retention + 3 ingested + 1 finalized = 6.
collected = []
async def _consume_tail() -> None:
async for evt in registry.tail_events(since_sequence=0, poll_interval_seconds=0.01):
collected.append(evt)
if len(collected) >= 5:
if len(collected) >= 6:
break
await asyncio.wait_for(_consume_tail(), timeout=5.0)
assert [e.event_type for e in collected] == [
"v1.package.created",
"v1.retention.default_applied",
"v1.file.ingested",
"v1.file.ingested",
"v1.file.ingested",

View File

@@ -0,0 +1,250 @@
"""Retention lifecycle integration tests for ARTIFACT-STORE-WP-0003."""
from __future__ import annotations
from collections.abc import AsyncIterator
from datetime import datetime, timedelta
from pathlib import Path
import cbor2
import pytest
import pytest_asyncio
from fastapi.testclient import TestClient
from sqlalchemy import insert
from sqlalchemy.ext.asyncio import AsyncEngine, create_async_engine
from artifactstore.api.http import create_app
from artifactstore.config import Settings
from artifactstore.dataplane import InProcessDataPlane
from artifactstore.db.schema import metadata, retention_classes
from artifactstore.db.seed import RETENTION_CLASS_SEEDS
from artifactstore.events import RegistryViewWriter
from artifactstore.registry import Registry, RetentionStateError
from artifactstore.retention import RetentionPolicy
from artifactstore.storage import LocalBackend
AUTH = {"Authorization": "Bearer test-token"}
@pytest_asyncio.fixture
async def engine(tmp_path: Path) -> AsyncIterator[AsyncEngine]:
    """Yield an async engine over a fresh SQLite file with schema and seeds in place.

    The schema and the retention-class rows are written in one transaction;
    the engine is disposed once the test is done with it.
    """
    db_file = tmp_path / "retention.db"
    async_engine = create_async_engine(f"sqlite+aiosqlite:///{db_file}")
    async with async_engine.begin() as txn:
        await txn.run_sync(metadata.create_all)
        for seed_row in RETENTION_CLASS_SEEDS:
            statement = insert(retention_classes).values(**seed_row)
            await txn.execute(statement)
    yield async_engine
    await async_engine.dispose()
@pytest.fixture
def registry(engine: AsyncEngine, tmp_path: Path) -> Registry:
    """Build a ``Registry`` on the test engine with a local backend under ``tmp_path``."""
    local_backend = LocalBackend(tmp_path / "store", backend_id="local")
    data_plane = InProcessDataPlane(local_backend, tmp_dir=tmp_path / "dp-tmp")
    view_writer = RegistryViewWriter()
    # NOTE(review): presumably maps retention class -> default duration in
    # seconds, so "transient" packages expire immediately; confirm against
    # RetentionPolicy.
    policy = RetentionPolicy({"transient": 0})
    return Registry(engine, data_plane, view_writer, policy)
def _http_settings(tmp_path: Path, retention_config_path: Path | None = None) -> Settings:
    """Prepare a seeded SQLite database plus storage directory and return ``Settings``.

    ``retention_config_path`` is passed through as a string; ``None`` (or any
    falsy value) becomes the empty string.
    """
    # Local import: this module's top-level sqlalchemy import covers only `insert`.
    from sqlalchemy import create_engine

    blob_dir = tmp_path / "storage"
    blob_dir.mkdir(parents=True, exist_ok=True)

    db_file = tmp_path / "retention-http.db"
    seed_rows = [dict(seed) for seed in RETENTION_CLASS_SEEDS]
    bootstrap_engine = create_engine(f"sqlite:///{db_file}", future=True)
    metadata.create_all(bootstrap_engine)
    with bootstrap_engine.begin() as txn:
        txn.execute(insert(retention_classes), seed_rows)
    bootstrap_engine.dispose()

    config_path_text = str(retention_config_path or "")
    return Settings(
        database_url=f"sqlite+aiosqlite:///{db_file}",
        storage_local_root=str(blob_dir),
        log_level="INFO",
        auth_tokens="test-token",
        retention_config_path=config_path_text,
    )
def _create_package(
    client: TestClient,
    *,
    retention_class: str = "raw-evidence",
) -> dict[str, object]:
    """POST a package with the given retention class and return the response body.

    Asserts that the API answers with 201.
    """
    request_body = {
        "name": "retention",
        "producer": "tests",
        "subject": "retention-subject",
        "retention_class": retention_class,
        "metadata": {},
    }
    response = client.post("/packages", headers=AUTH, json=request_body)
    assert response.status_code == 201, response.text
    return dict(response.json())
async def test_default_retention_and_permanent_record(registry: Registry) -> None:
    """Compare the initial retention state of a transient and a permanent-record package.

    The transient package gets an expiry and one ``default_applied`` history
    event; the permanent-record package has no expiry. Neither is flagged as
    eligible for deletion straight after creation.
    """
    transient_id = await registry.create_package(
        name="short",
        producer="tests",
        subject="transient",
        retention_class="transient",
        actor="ops",
    )
    transient_state = await registry.get_retention_state(transient_id)
    assert transient_state.current_expires_at is not None
    assert transient_state.eligible_for_deletion is False

    permanent_id = await registry.create_package(
        name="forever",
        producer="tests",
        subject="permanent",
        retention_class="permanent-record",
        actor="ops",
    )
    permanent_state = await registry.get_retention_state(permanent_id)
    assert permanent_state.current_expires_at is None
    assert permanent_state.eligible_for_deletion is False

    history = await registry.retention_history(transient_id)
    recorded_types = [entry.event_type for entry in history]
    assert recorded_types == ["v1.retention.default_applied"]

    # The event payload is CBOR and records the default duration that was applied.
    first_payload = cbor2.loads(history[0].payload)
    assert first_payload["default_duration_seconds"] == 0
async def test_retention_extension_requires_later_expiry(registry: Registry) -> None:
    """Extending to the current expiry is rejected; extending to a later one is recorded."""
    package_id = await registry.create_package(
        name="extend",
        producer="tests",
        subject="extension",
        retention_class="transient",
        actor="ops",
    )
    current = await registry.get_retention_state(package_id)
    existing_expiry = current.current_expires_at
    assert existing_expiry is not None

    # An extension to exactly the existing expiry must be refused.
    with pytest.raises(RetentionStateError, match="strictly later"):
        await registry.extend_retention(
            package_id,
            new_expires_at=existing_expiry,
            reason="not later",
            actor="ops",
        )

    new_expiry = existing_expiry + timedelta(days=1)
    extended = await registry.extend_retention(
        package_id,
        new_expires_at=new_expiry,
        reason="needed for quarterly review",
        actor="ops",
    )
    assert extended.current_expires_at == new_expiry

    # The rejected attempt leaves no trace in the history.
    history = await registry.retention_history(package_id)
    recorded_types = [entry.event_type for entry in history]
    assert recorded_types == [
        "v1.retention.default_applied",
        "v1.retention.extended",
    ]
async def test_hold_release_and_sweeper_eligibility_transition(registry: Registry) -> None:
    """A hold keeps an expired package out of the sweep until the hold is released.

    While held, a sweep past the expiry marks nothing. Releasing the hold at
    that same instant flags the package as eligible, and a following sweep
    returns nothing further.
    """
    package_id = await registry.create_package(
        name="held",
        producer="tests",
        subject="hold-release",
        retention_class="transient",
        actor="ops",
    )
    initial = await registry.get_retention_state(package_id)
    assert initial.current_expires_at is not None
    # A reference instant safely past the package's expiry.
    after_expiry = initial.current_expires_at + timedelta(seconds=5)

    hold_id = await registry.apply_retention_hold(
        package_id,
        reason="quarterly hold",
        actor="ops",
    )
    held = await registry.get_retention_state(package_id)
    assert held.active_hold_id == hold_id

    swept_while_held = await registry.sweep_deletion_eligibility(now=after_expiry)
    assert swept_while_held == []
    still_held = await registry.get_retention_state(package_id)
    assert still_held.eligible_for_deletion is False

    released = await registry.release_retention_hold(
        package_id,
        hold_id,
        reason="hold complete",
        actor="ops",
        now=after_expiry,
    )
    assert released.active_hold_id is None
    assert released.eligible_for_deletion is True

    swept_after_release = await registry.sweep_deletion_eligibility(now=after_expiry)
    assert swept_after_release == []

    history = await registry.retention_history(package_id)
    recorded_types = [entry.event_type for entry in history]
    assert recorded_types == [
        "v1.retention.default_applied",
        "v1.retention.hold_applied",
        "v1.retention.hold_released",
        "v1.retention.deletion_eligible",
    ]
def test_http_retention_controls_and_history_formats(tmp_path: Path) -> None:
    """Drive extension, hold and release over HTTP, then read history as JSON and CBOR."""
    app = create_app(_http_settings(tmp_path))
    with TestClient(app) as client:
        package = _create_package(client)
        package_id = str(package["id"])
        retention_url = f"/packages/{package_id}/retention"

        # Push the expiry one week past the value returned at creation.
        current_expires_at = datetime.fromisoformat(str(package["expires_at"]))
        new_expiry = current_expires_at + timedelta(days=7)
        extension_body = {
            "new_expires_at": new_expiry.isoformat(),
            "reason": "retain for release signoff",
        }
        extended = client.post(
            f"{retention_url}/extensions",
            headers=AUTH,
            json=extension_body,
        )
        assert extended.status_code == 200, extended.text
        assert extended.json()["current_expires_at"] == new_expiry.isoformat()

        hold = client.post(
            f"{retention_url}/holds",
            headers=AUTH,
            json={"reason": "external audit"},
        )
        assert hold.status_code == 201, hold.text
        hold_id = hold.json()["hold_id"]

        released = client.post(
            f"{retention_url}/holds/{hold_id}/release",
            headers=AUTH,
            json={"reason": "audit complete"},
        )
        assert released.status_code == 200, released.text
        assert released.json()["active_hold_id"] is None

        # History without an explicit Accept header is read as JSON.
        history_json = client.get(f"{retention_url}/history", headers=AUTH)
        assert history_json.status_code == 200
        recorded_types = [entry["event_type"] for entry in history_json.json()["events"]]
        assert recorded_types == [
            "v1.retention.default_applied",
            "v1.retention.extended",
            "v1.retention.hold_applied",
            "v1.retention.hold_released",
        ]

        history_cbor = client.get(
            f"{retention_url}/history",
            headers={**AUTH, "Accept": "application/cbor"},
        )
        assert history_cbor.status_code == 200
        decoded_history = cbor2.loads(history_cbor.content)
        second_event = decoded_history["events"][1]
        assert second_event["event_type"] == (
            "v1.retention.extended"
        )