feat(state): clean up per-bottle state on session end (except capability-block) #26

Merged
didericis merged 3 commits from state-cleanup-on-close into main 2026-05-25 07:07:54 -04:00
9 changed files with 459 additions and 24 deletions
@@ -45,6 +45,10 @@ _STATE_SUBDIR = "state"
_PER_BOTTLE_DOCKERFILE_NAME = "Dockerfile"
_TRANSCRIPT_SUBDIR = "transcript"
_METADATA_NAME = "metadata.json"
# Empty marker file. capability_apply writes it before teardown so
# cli.py's session-end cleanup knows to preserve the state dir for
# `cli.py resume <identity>`. Absent = clean up.
_PRESERVE_MARKER = ".preserve"
# 5 chars of base36 alphabet ≈ 60M combinations. Plenty for human
# operators starting bottles by hand; collision-free in practice.
@@ -155,14 +159,61 @@ def transcript_snapshot_dir(identity: str) -> Path:
return bottle_state_dir(identity) / _TRANSCRIPT_SUBDIR
# --- Preserve-on-close marker ----------------------------------------------
def preserve_marker_path(identity: str) -> Path:
return bottle_state_dir(identity) / _PRESERVE_MARKER
def mark_preserved(identity: str) -> Path:
"""Mark this bottle's state for preservation across session
teardown. Written by capability_apply.apply_capability_change so
cli.py's session-end cleanup leaves the state dir intact for a
subsequent `cli.py resume`."""
path = preserve_marker_path(identity)
path.parent.mkdir(parents=True, exist_ok=True)
path.touch()
return path
def is_preserved(identity: str) -> bool:
return preserve_marker_path(identity).exists()
def clear_preserve_marker(identity: str) -> None:
"""Idempotent removal. Called at fresh launch (start or resume)
so a marker left from a prior capability-block doesn't keep
state alive past the next normal session-end."""
try:
preserve_marker_path(identity).unlink()
except FileNotFoundError:
pass
def cleanup_state(identity: str) -> None:
"""Remove the per-bottle state dir entirely. Called by cli.py
when a bottle session ends and is_preserved(identity) is False.
Idempotent — missing dir is success."""
import shutil
state_dir = bottle_state_dir(identity)
if state_dir.is_dir():
shutil.rmtree(state_dir, ignore_errors=True)
__all__ = [
"BottleMetadata",
"bottle_identity",
"bottle_state_dir",
"cleanup_state",
"clear_preserve_marker",
"is_preserved",
"mark_preserved",
"metadata_path",
"per_bottle_dockerfile",
"per_bottle_dockerfile_path",
"per_bottle_image_tag",
"preserve_marker_path",
"read_metadata",
"transcript_snapshot_dir",
"write_metadata",
@@ -37,6 +37,7 @@ from pathlib import Path
from ...log import info, warn
from .bottle_state import (
mark_preserved,
per_bottle_dockerfile,
per_bottle_dockerfile_path,
transcript_snapshot_dir,
@@ -114,9 +115,14 @@ def apply_capability_change(slug: str, new_dockerfile: str) -> tuple[str, str]:
raise CapabilityApplyError("proposed Dockerfile is empty")
before = fetch_current_dockerfile(slug)
_snapshot_transcript(slug)
snapshot_transcript(slug)
_push_working_tree(slug)
write_per_bottle_dockerfile(slug, new_dockerfile)
# Set the preserve marker BEFORE teardown so cli.py's session-end
# cleanup sees it and keeps the state dir intact for the
# operator's `cli.py resume <identity>`. Without the marker the
# state dir would be deleted as part of normal session end.
mark_preserved(slug)
_teardown_bottle(slug)
return before, new_dockerfile
@@ -133,12 +139,19 @@ def _repo_dockerfile_path() -> Path:
return Path(__file__).resolve().parent.parent.parent.parent / "Dockerfile"
def _snapshot_transcript(slug: str) -> None:
def snapshot_transcript(slug: str) -> None:
"""`docker cp` /home/node/.claude out of the agent container into
~/.claude-bottle/state/<slug>/transcript/. Best-effort: missing
container, missing dir, or cp error all log a warning and return.
The transcript is what `claude --resume` reads to pick up where
the agent left off."""
the agent left off.
Called from two places:
- capability-apply, before tearing the bottle down.
- cli.py's session-end path, before the launch context closes,
so a crash or normal exit also leaves a transcript on disk
(deleted along with the state dir on clean exit, kept on
crash or capability-block per the preserve marker)."""
container = _agent_container_name(slug)
dest = transcript_snapshot_dir(slug)
if dest.exists():
@@ -151,11 +164,11 @@ def _snapshot_transcript(slug: str) -> None:
)
if r.returncode != 0:
warn(
f"capability-apply: transcript snapshot skipped "
f"transcript snapshot skipped "
f"({(r.stderr or '').strip() or 'no transcript dir in container?'})"
)
return
info(f"capability-apply: transcript snapshotted to {dest}")
info(f"transcript snapshotted to {dest}")
def _push_working_tree(slug: str) -> None:
@@ -207,4 +220,5 @@ __all__ = [
"CapabilityApplyError",
"apply_capability_change",
"fetch_current_dockerfile",
"snapshot_transcript",
]
+5
View File
@@ -30,6 +30,7 @@ from .git_gate import DockerGitGate, git_gate_container_name
from .bottle_state import (
BottleMetadata,
bottle_identity,
clear_preserve_marker,
per_bottle_dockerfile,
per_bottle_dockerfile_path,
per_bottle_image_tag,
@@ -73,6 +74,10 @@ def resolve_plan(
copy_cwd=spec.copy_cwd,
started_at=datetime.now(timezone.utc).isoformat(),
))
# Clear any leftover preserve marker from a prior capability-block
# so this fresh launch can be cleaned up at session-end unless
# the agent triggers another capability-block.
clear_preserve_marker(slug)
# PRD 0016 capability-block: if a per-bottle Dockerfile has been
# written (via apply_capability_change), the base image becomes
+77 -11
View File
@@ -1,10 +1,19 @@
"""cleanup: stop and remove all orphaned claude-bottle resources
(containers + networks) left behind by previous bottles."""
(containers + networks) left behind by previous bottles, plus
optionally the per-bottle state dirs under ~/.claude-bottle/state/.
State cleanup is prompted separately from container cleanup because
the trade-off is different: containers + networks are pure debris,
but a state dir may carry a resumable bottle (capability-block
rebuild + transcript snapshot) the operator still wants."""
from __future__ import annotations
import shutil
import sys
from pathlib import Path
from .. import supervise as _supervise
from ..backend import get_bottle_backend
from ..log import info
from ._common import read_tty_line
@@ -13,19 +22,76 @@ from ._common import read_tty_line
def cmd_cleanup(_argv: list[str]) -> int:
backend = get_bottle_backend()
plan = backend.prepare_cleanup()
state_dirs = _enumerate_state_dirs()
if plan.empty:
if plan.empty and not state_dirs:
info("no claude-bottle resources to clean up")
return 0
plan.print()
sys.stderr.write("claude-bottle: remove all of the above? [y/N] ")
if not plan.empty:
plan.print()
if _prompt_yes("remove all of the above?"):
backend.cleanup(plan)
info("containers + networks: cleaned")
else:
info("containers + networks: skipped")
if state_dirs:
_print_state(state_dirs)
if _prompt_yes(
"remove per-bottle state? (loses resumable bottles)",
):
for d in state_dirs:
shutil.rmtree(d, ignore_errors=True)
info(f"state: removed {len(state_dirs)} dir(s)")
else:
info("state: skipped")
return 0
# --- State enumeration + display ------------------------------------------
def _enumerate_state_dirs() -> list[Path]:
"""All per-bottle state dirs under ~/.claude-bottle/state/.
Sorted for stable preflight output."""
state_root = _supervise.claude_bottle_root() / "state"
if not state_root.is_dir():
return []
return sorted(p for p in state_root.iterdir() if p.is_dir())
def _state_summary(path: Path) -> str:
"""One-line description suitable for the cleanup prompt. Calls
out resumability so the operator can decide whether removing it
loses anything they care about."""
flags: list[str] = []
if (path / "metadata.json").is_file():
flags.append("resumable")
else:
flags.append("no metadata.json (orphan)")
if (path / "Dockerfile").is_file():
flags.append("rebuilt Dockerfile")
if (path / "transcript").is_dir():
flags.append("transcript snapshot")
if (path / ".preserve").is_file():
flags.append("preserve marker")
return f"state: {path.name} ({', '.join(flags)})"
def _print_state(dirs: list[Path]) -> None:
print(file=sys.stderr)
for d in dirs:
info(_state_summary(d))
print(file=sys.stderr)
# --- Prompt ----------------------------------------------------------------
def _prompt_yes(message: str) -> bool:
sys.stderr.write(f"claude-bottle: {message} [y/N] ")
sys.stderr.flush()
reply = read_tty_line()
if reply not in ("y", "Y", "yes", "YES"):
info("aborted")
return 0
backend.cleanup(plan)
info("done")
return 0
return reply in ("y", "Y", "yes", "YES")
+47 -5
View File
@@ -17,6 +17,12 @@ import tempfile
from pathlib import Path
from ..backend import BottleSpec, get_bottle_backend
from ..backend.docker.bottle_state import (
cleanup_state,
is_preserved,
mark_preserved,
)
from ..backend.docker.capability_apply import snapshot_transcript
from ..log import die, info
from ..manifest import Manifest
from ._common import PROG, USER_CWD, read_tty_line
@@ -97,15 +103,51 @@ def _launch_bottle(
claude_args = ["--dangerously-skip-permissions"]
if remote_control:
claude_args.append("--remote-control")
bottle.exec_claude(claude_args, tty=True)
info(f"session ended; container {bottle.name} will be removed")
if identity:
info(f"to resume this bottle: ./cli.py resume {identity}")
return 0
exit_code = bottle.exec_claude(claude_args, tty=True)
info(
f"session ended (exit {exit_code}); "
f"container {bottle.name} will be removed"
)
# While the container is still alive: always snapshot the
# transcript and — if the agent exited non-zero — mark
# the state for preservation. Capability-block already
# did both before triggering teardown from the dashboard;
# this picks up crashes / Ctrl-Cs / OOM kills the same
# way. snapshot_transcript is best-effort so the
# capability-block path's prior snapshot isn't clobbered
# when the container is already gone.
_capture_session_state(identity, exit_code)
# Context exited → containers + networks gone. Now decide
# what to do with the per-bottle state dir on the host: any
# preserve marker (capability-block OR crash) keeps it; a
# clean exit cleans it up so ~/.claude-bottle/state/ doesn't
# accumulate per-launch debris.
_settle_state(identity)
return 0
finally:
shutil.rmtree(stage_dir, ignore_errors=True)
def _capture_session_state(identity: str, exit_code: int) -> None:
"""Inside the launch context, while the container is still
alive: snapshot the transcript and mark for preservation if
claude crashed. Pure-function-ish; tests stub the helpers."""
if not identity:
return
snapshot_transcript(identity)
if exit_code != 0:
mark_preserved(identity)
def _settle_state(identity: str) -> None:
if not identity:
return
if is_preserved(identity):
info(f"to resume this bottle: ./cli.py resume {identity}")
return
cleanup_state(identity)
def _identity_from_plan(plan: object) -> str:
"""Backend-specific: the docker plan exposes the identity as
`.slug`. Other backends in the future would expose their own
+54
View File
@@ -114,6 +114,60 @@ class TestBottleIdentity(unittest.TestCase):
self.assertTrue(identity.startswith("my-agent-"))
class TestPreserveMarker(_FakeHomeMixin, unittest.TestCase):
"""The .preserve marker is how capability_apply tells cli.py's
session-end cleanup to keep the state dir instead of removing it."""
def setUp(self):
self._setup_fake_home()
def tearDown(self):
self._teardown_fake_home()
def test_default_is_unpreserved(self):
self.assertFalse(bottle_state.is_preserved("dev-x"))
def test_mark_then_is_preserved(self):
bottle_state.mark_preserved("dev-x")
self.assertTrue(bottle_state.is_preserved("dev-x"))
def test_clear_removes_marker(self):
bottle_state.mark_preserved("dev-x")
bottle_state.clear_preserve_marker("dev-x")
self.assertFalse(bottle_state.is_preserved("dev-x"))
def test_clear_is_idempotent(self):
# No marker present — should not raise.
bottle_state.clear_preserve_marker("never-existed")
self.assertFalse(bottle_state.is_preserved("never-existed"))
def test_marker_path_under_state_dir(self):
path = bottle_state.preserve_marker_path("dev-x")
self.assertTrue(str(path).endswith("/.claude-bottle/state/dev-x/.preserve"))
class TestCleanupState(_FakeHomeMixin, unittest.TestCase):
"""cleanup_state removes the entire per-bottle state dir.
Called by cli.py when a session ends without the preserve marker."""
def setUp(self):
self._setup_fake_home()
def tearDown(self):
self._teardown_fake_home()
def test_removes_state_dir_and_contents(self):
bottle_state.write_per_bottle_dockerfile("dev-x", "FROM x\n")
d = bottle_state.bottle_state_dir("dev-x")
self.assertTrue(d.is_dir())
bottle_state.cleanup_state("dev-x")
self.assertFalse(d.exists())
def test_idempotent_when_dir_missing(self):
# Never created — should not raise.
bottle_state.cleanup_state("never-existed")
class TestBottleMetadata(_FakeHomeMixin, unittest.TestCase):
def setUp(self):
self._setup_fake_home()
+11 -3
View File
@@ -63,7 +63,7 @@ class TestApplyCapabilityChange(_FakeHomeMixin, unittest.TestCase):
# job is to sequence write + snapshot + push + teardown; we
# validate that sequence here, not the docker primitives.
self._calls: list[str] = []
self._orig_snapshot = capability_apply._snapshot_transcript
self._orig_snapshot = capability_apply.snapshot_transcript
self._orig_push = capability_apply._push_working_tree
self._orig_teardown = capability_apply._teardown_bottle
@@ -76,12 +76,12 @@ class TestApplyCapabilityChange(_FakeHomeMixin, unittest.TestCase):
def stub_teardown(slug):
self._calls.append(f"teardown:{slug}")
capability_apply._snapshot_transcript = stub_snapshot # type: ignore[assignment]
capability_apply.snapshot_transcript = stub_snapshot # type: ignore[assignment]
capability_apply._push_working_tree = stub_push # type: ignore[assignment]
capability_apply._teardown_bottle = stub_teardown # type: ignore[assignment]
def tearDown(self):
capability_apply._snapshot_transcript = self._orig_snapshot # type: ignore[assignment]
capability_apply.snapshot_transcript = self._orig_snapshot # type: ignore[assignment]
capability_apply._push_working_tree = self._orig_push # type: ignore[assignment]
capability_apply._teardown_bottle = self._orig_teardown # type: ignore[assignment]
self._teardown_fake_home()
@@ -107,6 +107,14 @@ class TestApplyCapabilityChange(_FakeHomeMixin, unittest.TestCase):
self._calls,
)
def test_marks_preserved_before_teardown(self):
# cli.py's session-end cleanup reads the marker after the
# bottle is torn down. The marker must therefore be written
# before teardown — otherwise the cleanup would see no
# marker and rm the state dir we just populated.
apply_capability_change("dev", "FROM new\n")
self.assertTrue(bottle_state.is_preserved("dev"))
def test_first_change_falls_back_to_repo_dockerfile_for_before(self):
# No per-bottle override yet — before-diff comes from the
# repo's Dockerfile.
+102
View File
@@ -0,0 +1,102 @@
"""Unit: cli/cleanup.py state-dir enumeration + summary.
The end-to-end cleanup-with-prompt flow is exercised manually;
here we cover the state-dir display logic so a regression in the
resumable / orphan / rebuild flags surfaces in unit CI."""
import tempfile
import unittest
from pathlib import Path
from claude_bottle import supervise
from claude_bottle.backend.docker import bottle_state
from claude_bottle.cli.cleanup import _enumerate_state_dirs, _state_summary
class _FakeHomeMixin:
def _setup_fake_home(self):
self._tmp = tempfile.TemporaryDirectory(prefix="cli-cleanup-test.")
original = supervise.claude_bottle_root
def fake_root() -> Path:
return Path(self._tmp.name) / ".claude-bottle"
supervise.claude_bottle_root = fake_root # type: ignore[assignment]
self._restore = lambda: setattr(supervise, "claude_bottle_root", original)
def _teardown_fake_home(self):
self._restore()
self._tmp.cleanup()
class TestEnumerateStateDirs(_FakeHomeMixin, unittest.TestCase):
def setUp(self):
self._setup_fake_home()
def tearDown(self):
self._teardown_fake_home()
def test_empty_when_no_state_root(self):
self.assertEqual([], _enumerate_state_dirs())
def test_lists_each_identity_dir(self):
bottle_state.write_per_bottle_dockerfile("dev-aaa", "FROM x\n")
bottle_state.write_per_bottle_dockerfile("api-bbb", "FROM y\n")
dirs = _enumerate_state_dirs()
self.assertEqual(
["api-bbb", "dev-aaa"],
[p.name for p in dirs],
)
def test_sorted_for_stable_preflight(self):
for name in ("z", "a", "m"):
bottle_state.write_per_bottle_dockerfile(name, "FROM x\n")
names = [p.name for p in _enumerate_state_dirs()]
self.assertEqual(["a", "m", "z"], names)
class TestStateSummary(_FakeHomeMixin, unittest.TestCase):
def setUp(self):
self._setup_fake_home()
def tearDown(self):
self._teardown_fake_home()
def _path(self, name: str) -> Path:
return bottle_state.bottle_state_dir(name)
def test_orphan_state_dir(self):
# Only a Dockerfile, no metadata.json — the "api / dev" shape
# that comes from pre-identity-fix code.
bottle_state.write_per_bottle_dockerfile("orphan", "FROM old\n")
s = _state_summary(self._path("orphan"))
self.assertIn("orphan", s)
self.assertIn("no metadata.json", s)
self.assertIn("rebuilt Dockerfile", s)
def test_resumable_state_dir(self):
bottle_state.write_metadata(bottle_state.BottleMetadata(
identity="dev-aaa", agent_name="dev",
cwd="/proj/A", copy_cwd=True, started_at="t",
))
s = _state_summary(self._path("dev-aaa"))
self.assertIn("resumable", s)
self.assertNotIn("rebuilt Dockerfile", s)
def test_resumable_with_capability_rebuild_and_preserve_marker(self):
bottle_state.write_metadata(bottle_state.BottleMetadata(
identity="dev-bbb", agent_name="dev",
cwd="", copy_cwd=False, started_at="t",
))
bottle_state.write_per_bottle_dockerfile("dev-bbb", "FROM rebuilt\n")
bottle_state.transcript_snapshot_dir("dev-bbb").mkdir(parents=True)
bottle_state.mark_preserved("dev-bbb")
s = _state_summary(self._path("dev-bbb"))
self.assertIn("resumable", s)
self.assertIn("rebuilt Dockerfile", s)
self.assertIn("transcript snapshot", s)
self.assertIn("preserve marker", s)
if __name__ == "__main__":
unittest.main()
+93
View File
@@ -0,0 +1,93 @@
"""Unit: cli/start.py session-end state capture (crash preservation).
The launch-context machinery is covered by integration; this isolates
the post-exec_claude decision: snapshot transcript + mark for
preservation if non-zero exit, no-op for clean exit."""
import tempfile
import unittest
from pathlib import Path
from claude_bottle import supervise
from claude_bottle.backend.docker import bottle_state
from claude_bottle.cli import start as start_mod
class _FakeHomeMixin:
def _setup_fake_home(self):
self._tmp = tempfile.TemporaryDirectory(prefix="cli-start-settle.")
self._original = supervise.claude_bottle_root
def fake_root() -> Path:
return Path(self._tmp.name) / ".claude-bottle"
supervise.claude_bottle_root = fake_root # type: ignore[assignment]
def _teardown_fake_home(self):
supervise.claude_bottle_root = self._original # type: ignore[assignment]
self._tmp.cleanup()
class TestCaptureSessionState(_FakeHomeMixin, unittest.TestCase):
def setUp(self):
self._setup_fake_home()
# Stub the docker-dependent snapshot call so this stays a
# unit test. apply_capability_change's integration test
# covers the real docker cp path.
self._snap_calls: list[str] = []
self._orig_snap = start_mod.snapshot_transcript
start_mod.snapshot_transcript = lambda identity: (
self._snap_calls.append(identity)
)
def tearDown(self):
start_mod.snapshot_transcript = self._orig_snap
self._teardown_fake_home()
def test_clean_exit_snapshots_but_does_not_mark(self):
start_mod._capture_session_state("dev-abc", exit_code=0)
self.assertEqual(["dev-abc"], self._snap_calls)
self.assertFalse(bottle_state.is_preserved("dev-abc"))
def test_crash_snapshots_and_marks(self):
start_mod._capture_session_state("dev-abc", exit_code=137)
self.assertEqual(["dev-abc"], self._snap_calls)
self.assertTrue(bottle_state.is_preserved("dev-abc"))
def test_ctrl_c_treated_as_crash(self):
# SIGINT delivers exit 130; the operator may have Ctrl-C'd
# because something went wrong, so we preserve.
start_mod._capture_session_state("dev-abc", exit_code=130)
self.assertTrue(bottle_state.is_preserved("dev-abc"))
def test_empty_identity_is_noop(self):
# Backends without an identity field shouldn't crash this
# path (the _identity_from_plan helper falls back to "").
start_mod._capture_session_state("", exit_code=137)
self.assertEqual([], self._snap_calls)
class TestSettleState(_FakeHomeMixin, unittest.TestCase):
def setUp(self):
self._setup_fake_home()
def tearDown(self):
self._teardown_fake_home()
def test_preserved_state_survives(self):
bottle_state.write_per_bottle_dockerfile("dev-abc", "FROM x\n")
bottle_state.mark_preserved("dev-abc")
start_mod._settle_state("dev-abc")
self.assertTrue(bottle_state.bottle_state_dir("dev-abc").is_dir())
def test_unpreserved_state_is_cleaned(self):
bottle_state.write_per_bottle_dockerfile("dev-abc", "FROM x\n")
start_mod._settle_state("dev-abc")
self.assertFalse(bottle_state.bottle_state_dir("dev-abc").exists())
def test_empty_identity_is_noop(self):
start_mod._settle_state("") # should not raise
if __name__ == "__main__":
unittest.main()