From ef5d2f9a4d96ecad923aeae6b128d81566925322 Mon Sep 17 00:00:00 2001 From: didericis Date: Mon, 25 May 2026 07:05:23 -0400 Subject: [PATCH] feat(state): preserve on crash + always snapshot transcript MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Extends the preserve-on-capability-block design to also preserve state on agent crash, and snapshots the transcript on every teardown so any resume (crash or capability-block) gets a warm claude session — not a cold start. - capability_apply: rename _snapshot_transcript → snapshot_transcript (public; reused below). No behavior change in the capability path. - cli/start.py: capture bottle.exec_claude's exit code; while the container is still alive (inside the launch context): * always snapshot_transcript(identity) * if exit_code != 0, mark_preserved(identity) Then the existing _settle_state runs after teardown. Now the preservation matrix is: exit 0 (clean) → snapshot + cleanup state exit ≠0 (crash, Ctrl-C) → snapshot + preserve + show resume hint capability-block → (already snapshotted/preserved by apply before teardown; this path is a no-op because the container is already gone by the time exec_claude returns) snapshot_transcript is best-effort — capability-block's earlier snapshot is not clobbered when the container is already torn down, and a missing /home/node/.claude is a warn + skip. Tested behavior: clean exit doesn't preserve, non-zero exit (including SIGINT/130 and SIGKILL/137) preserves; empty identity no-ops both helpers. Co-Authored-By: Claude Opus 4.7 --- .../backend/docker/capability_apply.py | 18 +++- claude_bottle/cli/start.py | 45 +++++++-- tests/unit/test_capability_apply.py | 6 +- tests/unit/test_cli_start_settle.py | 93 +++++++++++++++++++ 4 files changed, 144 insertions(+), 18 deletions(-) create mode 100644 tests/unit/test_cli_start_settle.py diff --git a/claude_bottle/backend/docker/capability_apply.py b/claude_bottle/backend/docker/capability_apply.py index 878d1a1..1a8b96a 100644 --- a/claude_bottle/backend/docker/capability_apply.py +++ b/claude_bottle/backend/docker/capability_apply.py @@ -115,7 +115,7 @@ def apply_capability_change(slug: str, new_dockerfile: str) -> tuple[str, str]: raise CapabilityApplyError("proposed Dockerfile is empty") before = fetch_current_dockerfile(slug) - _snapshot_transcript(slug) + snapshot_transcript(slug) _push_working_tree(slug) write_per_bottle_dockerfile(slug, new_dockerfile) # Set the preserve marker BEFORE teardown so cli.py's session-end @@ -139,12 +139,19 @@ def _repo_dockerfile_path() -> Path: return Path(__file__).resolve().parent.parent.parent.parent / "Dockerfile" -def _snapshot_transcript(slug: str) -> None: +def snapshot_transcript(slug: str) -> None: """`docker cp` /home/node/.claude out of the agent container into ~/.claude-bottle/state//transcript/. Best-effort: missing container, missing dir, or cp error all log a warning and return. The transcript is what `claude --resume` reads to pick up where - the agent left off.""" + the agent left off. + + Called from two places: + - capability-apply, before tearing the bottle down. + - cli.py's session-end path, before the launch context closes, + so a crash or normal exit also leaves a transcript on disk + (deleted along with the state dir on clean exit, kept on + crash or capability-block per the preserve marker).""" container = _agent_container_name(slug) dest = transcript_snapshot_dir(slug) if dest.exists(): @@ -157,11 +164,11 @@ def _snapshot_transcript(slug: str) -> None: ) if r.returncode != 0: warn( - f"capability-apply: transcript snapshot skipped " + f"transcript snapshot skipped " f"({(r.stderr or '').strip() or 'no transcript dir in container?'})" ) return - info(f"capability-apply: transcript snapshotted to {dest}") + info(f"transcript snapshotted to {dest}") def _push_working_tree(slug: str) -> None: @@ -213,4 +220,5 @@ __all__ = [ "CapabilityApplyError", "apply_capability_change", "fetch_current_dockerfile", + "snapshot_transcript", ] diff --git a/claude_bottle/cli/start.py b/claude_bottle/cli/start.py index 0d82c99..45b5c74 100644 --- a/claude_bottle/cli/start.py +++ b/claude_bottle/cli/start.py @@ -17,7 +17,12 @@ import tempfile from pathlib import Path from ..backend import BottleSpec, get_bottle_backend -from ..backend.docker.bottle_state import cleanup_state, is_preserved +from ..backend.docker.bottle_state import ( + cleanup_state, + is_preserved, + mark_preserved, +) +from ..backend.docker.capability_apply import snapshot_transcript from ..log import die, info from ..manifest import Manifest from ._common import PROG, USER_CWD, read_tty_line @@ -98,22 +103,42 @@ def _launch_bottle( claude_args = ["--dangerously-skip-permissions"] if remote_control: claude_args.append("--remote-control") - bottle.exec_claude(claude_args, tty=True) - info(f"session ended; container {bottle.name} will be removed") + exit_code = bottle.exec_claude(claude_args, tty=True) + info( + f"session ended (exit {exit_code}); " + f"container {bottle.name} will be removed" + ) + # While the container is still alive: always snapshot the + # transcript and — if the agent exited non-zero — mark + # the state for preservation. Capability-block already + # did both before triggering teardown from the dashboard; + # this picks up crashes / Ctrl-Cs / OOM kills the same + # way. snapshot_transcript is best-effort so the + # capability-block path's prior snapshot isn't clobbered + # when the container is already gone. + _capture_session_state(identity, exit_code) # Context exited → containers + networks gone. Now decide - # what to do with the per-bottle state dir on the host: - # capability-block apply sets the preserve marker before it - # tears the bottle down, so the operator can resume from the - # new Dockerfile + transcript snapshot. Any other session - # end (normal exit, agent crash, Ctrl-C) leaves no marker, - # and the state dir gets cleaned up so ~/.claude-bottle/state/ - # doesn't accumulate per-launch debris. + # what to do with the per-bottle state dir on the host: any + # preserve marker (capability-block OR crash) keeps it; a + # clean exit cleans it up so ~/.claude-bottle/state/ doesn't + # accumulate per-launch debris. _settle_state(identity) return 0 finally: shutil.rmtree(stage_dir, ignore_errors=True) +def _capture_session_state(identity: str, exit_code: int) -> None: + """Inside the launch context, while the container is still + alive: snapshot the transcript and mark for preservation if + claude crashed. Pure-function-ish; tests stub the helpers.""" + if not identity: + return + snapshot_transcript(identity) + if exit_code != 0: + mark_preserved(identity) + + def _settle_state(identity: str) -> None: if not identity: return diff --git a/tests/unit/test_capability_apply.py b/tests/unit/test_capability_apply.py index 60bd139..bcef7fb 100644 --- a/tests/unit/test_capability_apply.py +++ b/tests/unit/test_capability_apply.py @@ -63,7 +63,7 @@ class TestApplyCapabilityChange(_FakeHomeMixin, unittest.TestCase): # job is to sequence write + snapshot + push + teardown; we # validate that sequence here, not the docker primitives. self._calls: list[str] = [] - self._orig_snapshot = capability_apply._snapshot_transcript + self._orig_snapshot = capability_apply.snapshot_transcript self._orig_push = capability_apply._push_working_tree self._orig_teardown = capability_apply._teardown_bottle @@ -76,12 +76,12 @@ class TestApplyCapabilityChange(_FakeHomeMixin, unittest.TestCase): def stub_teardown(slug): self._calls.append(f"teardown:{slug}") - capability_apply._snapshot_transcript = stub_snapshot # type: ignore[assignment] + capability_apply.snapshot_transcript = stub_snapshot # type: ignore[assignment] capability_apply._push_working_tree = stub_push # type: ignore[assignment] capability_apply._teardown_bottle = stub_teardown # type: ignore[assignment] def tearDown(self): - capability_apply._snapshot_transcript = self._orig_snapshot # type: ignore[assignment] + capability_apply.snapshot_transcript = self._orig_snapshot # type: ignore[assignment] capability_apply._push_working_tree = self._orig_push # type: ignore[assignment] capability_apply._teardown_bottle = self._orig_teardown # type: ignore[assignment] self._teardown_fake_home() diff --git a/tests/unit/test_cli_start_settle.py b/tests/unit/test_cli_start_settle.py new file mode 100644 index 0000000..f6fad89 --- /dev/null +++ b/tests/unit/test_cli_start_settle.py @@ -0,0 +1,93 @@ +"""Unit: cli/start.py session-end state capture (crash preservation). + +The launch-context machinery is covered by integration; this isolates +the post-exec_claude decision: snapshot transcript + mark for +preservation if non-zero exit, no-op for clean exit.""" + +import tempfile +import unittest +from pathlib import Path + +from claude_bottle import supervise +from claude_bottle.backend.docker import bottle_state +from claude_bottle.cli import start as start_mod + + +class _FakeHomeMixin: + def _setup_fake_home(self): + self._tmp = tempfile.TemporaryDirectory(prefix="cli-start-settle.") + self._original = supervise.claude_bottle_root + + def fake_root() -> Path: + return Path(self._tmp.name) / ".claude-bottle" + + supervise.claude_bottle_root = fake_root # type: ignore[assignment] + + def _teardown_fake_home(self): + supervise.claude_bottle_root = self._original # type: ignore[assignment] + self._tmp.cleanup() + + +class TestCaptureSessionState(_FakeHomeMixin, unittest.TestCase): + def setUp(self): + self._setup_fake_home() + # Stub the docker-dependent snapshot call so this stays a + # unit test. apply_capability_change's integration test + # covers the real docker cp path. + self._snap_calls: list[str] = [] + self._orig_snap = start_mod.snapshot_transcript + start_mod.snapshot_transcript = lambda identity: ( + self._snap_calls.append(identity) + ) + + def tearDown(self): + start_mod.snapshot_transcript = self._orig_snap + self._teardown_fake_home() + + def test_clean_exit_snapshots_but_does_not_mark(self): + start_mod._capture_session_state("dev-abc", exit_code=0) + self.assertEqual(["dev-abc"], self._snap_calls) + self.assertFalse(bottle_state.is_preserved("dev-abc")) + + def test_crash_snapshots_and_marks(self): + start_mod._capture_session_state("dev-abc", exit_code=137) + self.assertEqual(["dev-abc"], self._snap_calls) + self.assertTrue(bottle_state.is_preserved("dev-abc")) + + def test_ctrl_c_treated_as_crash(self): + # SIGINT delivers exit 130; the operator may have Ctrl-C'd + # because something went wrong, so we preserve. + start_mod._capture_session_state("dev-abc", exit_code=130) + self.assertTrue(bottle_state.is_preserved("dev-abc")) + + def test_empty_identity_is_noop(self): + # Backends without an identity field shouldn't crash this + # path (the _identity_from_plan helper falls back to ""). + start_mod._capture_session_state("", exit_code=137) + self.assertEqual([], self._snap_calls) + + +class TestSettleState(_FakeHomeMixin, unittest.TestCase): + def setUp(self): + self._setup_fake_home() + + def tearDown(self): + self._teardown_fake_home() + + def test_preserved_state_survives(self): + bottle_state.write_per_bottle_dockerfile("dev-abc", "FROM x\n") + bottle_state.mark_preserved("dev-abc") + start_mod._settle_state("dev-abc") + self.assertTrue(bottle_state.bottle_state_dir("dev-abc").is_dir()) + + def test_unpreserved_state_is_cleaned(self): + bottle_state.write_per_bottle_dockerfile("dev-abc", "FROM x\n") + start_mod._settle_state("dev-abc") + self.assertFalse(bottle_state.bottle_state_dir("dev-abc").exists()) + + def test_empty_identity_is_noop(self): + start_mod._settle_state("") # should not raise + + +if __name__ == "__main__": + unittest.main()