generated from coulomb/repo-seed
Add reference-counted garbage collection
This commit is contained in:
@@ -186,6 +186,34 @@ def retention_sweep() -> None:
|
||||
typer.echo(json.dumps({"marked_package_ids": marked, "marked_count": len(marked)}, indent=2))
|
||||
|
||||
|
||||
@retention_app.command("gc")
|
||||
def retention_gc() -> None:
|
||||
"""Run one garbage-collection pass for deletion-eligible packages."""
|
||||
settings = get_settings()
|
||||
results = asyncio.run(_garbage_collect_async(settings))
|
||||
object_keys = {
|
||||
(r["backend_id"], r["content_address"])
|
||||
for r in results
|
||||
if r["object_delete_attempted"]
|
||||
}
|
||||
deleted_object_keys = {
|
||||
(r["backend_id"], r["content_address"])
|
||||
for r in results
|
||||
if r["object_delete_attempted"] and r["object_deleted"]
|
||||
}
|
||||
typer.echo(
|
||||
json.dumps(
|
||||
{
|
||||
"released_location_count": len(results),
|
||||
"delete_attempted_object_count": len(object_keys),
|
||||
"deleted_object_count": len(deleted_object_keys),
|
||||
"results": results,
|
||||
},
|
||||
indent=2,
|
||||
)
|
||||
)
|
||||
|
||||
|
||||
@storage_app.command("verify")
|
||||
def storage_verify(
|
||||
backend: str | None = typer.Option(
|
||||
@@ -285,6 +313,30 @@ async def _retention_sweep_async(settings: Settings) -> list[str]:
|
||||
return [str(package_id) for package_id in marked]
|
||||
|
||||
|
||||
async def _garbage_collect_async(settings: Settings) -> list[dict[str, Any]]:
|
||||
from artifactstore.app import build_registry
|
||||
|
||||
registry: Registry = build_registry(settings)
|
||||
try:
|
||||
results = await registry.collect_garbage()
|
||||
finally:
|
||||
await registry.dispose()
|
||||
return [
|
||||
{
|
||||
"storage_location_id": str(result.storage_location_id),
|
||||
"file_id": str(result.file_id),
|
||||
"package_id": str(result.package_id),
|
||||
"backend_id": result.backend_id,
|
||||
"content_address": result.content_address,
|
||||
"object_delete_attempted": result.object_delete_attempted,
|
||||
"object_deleted": result.object_deleted,
|
||||
"ref_count_before": result.ref_count_before,
|
||||
"ref_count_after": result.ref_count_after,
|
||||
}
|
||||
for result in results
|
||||
]
|
||||
|
||||
|
||||
async def _storage_verify_async(
|
||||
settings: Settings,
|
||||
*,
|
||||
|
||||
@@ -23,7 +23,7 @@ from datetime import UTC, datetime
|
||||
from uuid import UUID
|
||||
|
||||
import cbor2
|
||||
from sqlalchemy import delete, insert, update
|
||||
from sqlalchemy import delete, insert, select, update
|
||||
from sqlalchemy.ext.asyncio import AsyncConnection
|
||||
|
||||
from artifactstore.db.schema import (
|
||||
@@ -267,6 +267,47 @@ async def _apply_storage_location_verified(
|
||||
)
|
||||
|
||||
|
||||
async def _apply_storage_location_deleted(
|
||||
connection: AsyncConnection,
|
||||
event: Event,
|
||||
) -> None:
|
||||
if event.subject_id is None:
|
||||
raise ValueError("v1.storage.location_deleted event must have subject_id")
|
||||
payload = cbor2.loads(event.payload)
|
||||
await connection.execute(
|
||||
update(storage_locations)
|
||||
.where(storage_locations.c.id == UUID(payload["storage_location_id"]))
|
||||
.values(
|
||||
status="deleted",
|
||||
restore_status="object_deleted"
|
||||
if payload.get("object_deleted")
|
||||
else "reference_released",
|
||||
)
|
||||
)
|
||||
remaining = (
|
||||
await connection.execute(
|
||||
select(storage_locations.c.id)
|
||||
.join(
|
||||
artifact_files,
|
||||
artifact_files.c.id == storage_locations.c.artifact_file_id,
|
||||
)
|
||||
.where(
|
||||
artifact_files.c.package_id == event.subject_id,
|
||||
storage_locations.c.status != "deleted",
|
||||
)
|
||||
.limit(1)
|
||||
)
|
||||
).first()
|
||||
package_values: dict[str, object] = {"last_event_sequence": event.sequence}
|
||||
if remaining is None:
|
||||
package_values["status"] = "garbage_collected"
|
||||
await connection.execute(
|
||||
update(artifact_packages)
|
||||
.where(artifact_packages.c.id == event.subject_id)
|
||||
.values(**package_values)
|
||||
)
|
||||
|
||||
|
||||
def _parse_iso(value: str | None) -> datetime | None:
|
||||
if value is None:
|
||||
return None
|
||||
@@ -286,4 +327,5 @@ _HANDLERS = {
|
||||
"v1.retention.hold_released": _apply_retention_hold_released,
|
||||
"v1.retention.deletion_eligible": _apply_retention_deletion_eligible,
|
||||
"v1.storage.location_verified": _apply_storage_location_verified,
|
||||
"v1.storage.location_deleted": _apply_storage_location_deleted,
|
||||
}
|
||||
|
||||
@@ -70,6 +70,7 @@ __all__ = [
|
||||
"DuplicateRelativePathError",
|
||||
"FileNotFoundError",
|
||||
"FileRecord",
|
||||
"GarbageCollectionRecord",
|
||||
"IllegalPackageStateError",
|
||||
"MetadataSchemaRecord",
|
||||
"PackageNotFoundError",
|
||||
@@ -183,6 +184,21 @@ class StorageVerificationRecord:
|
||||
mismatch: str | None
|
||||
|
||||
|
||||
@dataclass(frozen=True, slots=True)
|
||||
class GarbageCollectionRecord:
|
||||
"""Result of releasing one storage location during garbage collection."""
|
||||
|
||||
storage_location_id: UUID
|
||||
file_id: UUID
|
||||
package_id: UUID
|
||||
backend_id: str
|
||||
content_address: str
|
||||
object_delete_attempted: bool
|
||||
object_deleted: bool
|
||||
ref_count_before: int
|
||||
ref_count_after: int
|
||||
|
||||
|
||||
_RETENTION_EVENT_TYPES = (
|
||||
"v1.retention.default_applied",
|
||||
"v1.retention.extended",
|
||||
@@ -785,6 +801,7 @@ class Registry:
|
||||
stmt = select(storage_locations)
|
||||
if backend_id is not None:
|
||||
stmt = stmt.where(storage_locations.c.backend_id == backend_id)
|
||||
stmt = stmt.where(storage_locations.c.status != "deleted")
|
||||
async with self._engine.connect() as conn:
|
||||
rows = (await conn.execute(stmt.order_by(storage_locations.c.id))).all()
|
||||
|
||||
@@ -842,6 +859,109 @@ class Registry:
|
||||
)
|
||||
return results
|
||||
|
||||
async def collect_garbage(
|
||||
self,
|
||||
*,
|
||||
actor: str = "garbage-collector",
|
||||
) -> list[GarbageCollectionRecord]:
|
||||
"""Release deletion-eligible storage locations and delete unreferenced bytes."""
|
||||
stmt = (
|
||||
select(
|
||||
artifact_files.c.package_id,
|
||||
artifact_files.c.id.label("file_id"),
|
||||
storage_locations.c.id.label("storage_location_id"),
|
||||
storage_locations.c.backend_id,
|
||||
storage_locations.c.content_address,
|
||||
)
|
||||
.join(
|
||||
artifact_files,
|
||||
artifact_files.c.id == storage_locations.c.artifact_file_id,
|
||||
)
|
||||
.join(
|
||||
retention_state,
|
||||
retention_state.c.package_id == artifact_files.c.package_id,
|
||||
)
|
||||
.where(
|
||||
retention_state.c.eligible_for_deletion.is_(True),
|
||||
retention_state.c.active_hold_id.is_(None),
|
||||
storage_locations.c.status != "deleted",
|
||||
)
|
||||
.order_by(
|
||||
artifact_files.c.package_id,
|
||||
artifact_files.c.relative_path,
|
||||
storage_locations.c.id,
|
||||
)
|
||||
)
|
||||
async with self._engine.connect() as conn:
|
||||
rows = (await conn.execute(stmt)).all()
|
||||
|
||||
groups: dict[tuple[str, str], list[Any]] = {}
|
||||
for row in rows:
|
||||
groups.setdefault((row.backend_id, row.content_address), []).append(row)
|
||||
|
||||
results: list[GarbageCollectionRecord] = []
|
||||
for (backend_id, content_address), group_rows in groups.items():
|
||||
async with self._engine.connect() as conn:
|
||||
active_refs = (
|
||||
await conn.execute(
|
||||
select(storage_locations.c.id).where(
|
||||
storage_locations.c.backend_id == backend_id,
|
||||
storage_locations.c.content_address == content_address,
|
||||
storage_locations.c.status != "deleted",
|
||||
)
|
||||
)
|
||||
).all()
|
||||
ref_count_before = len(active_refs)
|
||||
ref_count_after = max(ref_count_before - len(group_rows), 0)
|
||||
object_delete_attempted = ref_count_after == 0
|
||||
object_deleted = False
|
||||
if object_delete_attempted:
|
||||
deletion = await self._dataplane.delete_object(
|
||||
ContentAddress(content_address),
|
||||
backend_id=backend_id,
|
||||
)
|
||||
object_deleted = deletion.deleted
|
||||
|
||||
for row in group_rows:
|
||||
payload = cbor2.dumps(
|
||||
{
|
||||
"storage_location_id": str(row.storage_location_id),
|
||||
"file_id": str(row.file_id),
|
||||
"package_id": str(row.package_id),
|
||||
"backend_id": backend_id,
|
||||
"content_address": content_address,
|
||||
"object_delete_attempted": object_delete_attempted,
|
||||
"object_deleted": object_deleted,
|
||||
"ref_count_before": ref_count_before,
|
||||
"ref_count_after": ref_count_after,
|
||||
},
|
||||
canonical=True,
|
||||
)
|
||||
event = make_event(
|
||||
event_type="v1.storage.location_deleted",
|
||||
subject_kind="package",
|
||||
subject_id=row.package_id,
|
||||
actor=actor,
|
||||
payload=payload,
|
||||
)
|
||||
async with self._engine.begin() as conn:
|
||||
written = await write(conn, event)
|
||||
await self._view_writer.apply(conn, written)
|
||||
results.append(
|
||||
GarbageCollectionRecord(
|
||||
storage_location_id=row.storage_location_id,
|
||||
file_id=row.file_id,
|
||||
package_id=row.package_id,
|
||||
backend_id=backend_id,
|
||||
content_address=content_address,
|
||||
object_delete_attempted=object_delete_attempted,
|
||||
object_deleted=object_deleted,
|
||||
ref_count_before=ref_count_before,
|
||||
ref_count_after=ref_count_after,
|
||||
)
|
||||
)
|
||||
return results
|
||||
|
||||
async def get_manifest_bytes(self, package_id: UUID, *, format: str = "cbor") -> bytes:
|
||||
"""Return the finalised manifest. ``format`` is ``cbor`` (canonical
|
||||
CBOR, the wire form) or ``json`` (the JCS projection)."""
|
||||
@@ -874,6 +994,8 @@ class Registry:
|
||||
) -> AsyncIterator[bytes]:
|
||||
"""Return an async byte iterator for the bytes of a stored file."""
|
||||
record = await self.get_file_metadata(file_id)
|
||||
if record.storage_status == "deleted":
|
||||
raise FileNotFoundError(f"file has been garbage collected: {file_id}")
|
||||
ca = ContentAddress(record.content_address)
|
||||
return await self._dataplane.serve_object(
|
||||
ca,
|
||||
|
||||
Reference in New Issue
Block a user