feat(state): preserve on crash + always snapshot transcript
test / unit (pull_request) Successful in 17s
test / integration (pull_request) Successful in 1m31s

Extends the preserve-on-capability-block design to also preserve
state on agent crash, and snapshots the transcript on every
teardown so any resume (crash or capability-block) gets a warm
claude session — not a cold start.

- capability_apply: rename _snapshot_transcript → snapshot_transcript
  (public; reused below). No behavior change in the capability path.
- cli/start.py: capture bottle.exec_claude's exit code; while the
  container is still alive (inside the launch context):
    * always snapshot_transcript(identity)
    * if exit_code != 0, mark_preserved(identity)
  Then the existing _settle_state runs after teardown.

Now the preservation matrix is:

  exit 0   (clean)          → snapshot + cleanup state
  exit ≠0  (crash, Ctrl-C)  → snapshot + preserve + show resume hint
  capability-block          → (already snapshotted/preserved by apply
                               before teardown; this path is a no-op
                               because the container is already gone
                               by the time exec_claude returns)

snapshot_transcript is best-effort — capability-block's earlier
snapshot is not clobbered when the container is already torn down,
and a missing /home/node/.claude is a warn + skip.

Tested behavior: clean exit doesn't preserve, non-zero exit
(including SIGINT/130 and SIGKILL/137) preserves; empty identity
no-ops both helpers.

Co-Authored-By: Claude Opus 4.7 <noreply@anthropic.com>
This commit is contained in:
2026-05-25 07:05:23 -04:00
parent fb2b5844c4
commit ef5d2f9a4d
4 changed files with 144 additions and 18 deletions
@@ -115,7 +115,7 @@ def apply_capability_change(slug: str, new_dockerfile: str) -> tuple[str, str]:
raise CapabilityApplyError("proposed Dockerfile is empty")
before = fetch_current_dockerfile(slug)
_snapshot_transcript(slug)
snapshot_transcript(slug)
_push_working_tree(slug)
write_per_bottle_dockerfile(slug, new_dockerfile)
# Set the preserve marker BEFORE teardown so cli.py's session-end
@@ -139,12 +139,19 @@ def _repo_dockerfile_path() -> Path:
return Path(__file__).resolve().parent.parent.parent.parent / "Dockerfile"
def _snapshot_transcript(slug: str) -> None:
def snapshot_transcript(slug: str) -> None:
"""`docker cp` /home/node/.claude out of the agent container into
~/.claude-bottle/state/<slug>/transcript/. Best-effort: missing
container, missing dir, or cp error all log a warning and return.
The transcript is what `claude --resume` reads to pick up where
the agent left off."""
the agent left off.
Called from two places:
- capability-apply, before tearing the bottle down.
- cli.py's session-end path, before the launch context closes,
so a crash or normal exit also leaves a transcript on disk
(deleted along with the state dir on clean exit, kept on
crash or capability-block per the preserve marker)."""
container = _agent_container_name(slug)
dest = transcript_snapshot_dir(slug)
if dest.exists():
@@ -157,11 +164,11 @@ def _snapshot_transcript(slug: str) -> None:
)
if r.returncode != 0:
warn(
f"capability-apply: transcript snapshot skipped "
f"transcript snapshot skipped "
f"({(r.stderr or '').strip() or 'no transcript dir in container?'})"
)
return
info(f"capability-apply: transcript snapshotted to {dest}")
info(f"transcript snapshotted to {dest}")
def _push_working_tree(slug: str) -> None:
@@ -213,4 +220,5 @@ __all__ = [
"CapabilityApplyError",
"apply_capability_change",
"fetch_current_dockerfile",
"snapshot_transcript",
]
+35 -10
View File
@@ -17,7 +17,12 @@ import tempfile
from pathlib import Path
from ..backend import BottleSpec, get_bottle_backend
from ..backend.docker.bottle_state import cleanup_state, is_preserved
from ..backend.docker.bottle_state import (
cleanup_state,
is_preserved,
mark_preserved,
)
from ..backend.docker.capability_apply import snapshot_transcript
from ..log import die, info
from ..manifest import Manifest
from ._common import PROG, USER_CWD, read_tty_line
@@ -98,22 +103,42 @@ def _launch_bottle(
claude_args = ["--dangerously-skip-permissions"]
if remote_control:
claude_args.append("--remote-control")
bottle.exec_claude(claude_args, tty=True)
info(f"session ended; container {bottle.name} will be removed")
exit_code = bottle.exec_claude(claude_args, tty=True)
info(
f"session ended (exit {exit_code}); "
f"container {bottle.name} will be removed"
)
# While the container is still alive: always snapshot the
# transcript and — if the agent exited non-zero — mark
# the state for preservation. Capability-block already
# did both before triggering teardown from the dashboard;
# this picks up crashes / Ctrl-Cs / OOM kills the same
# way. snapshot_transcript is best-effort so the
# capability-block path's prior snapshot isn't clobbered
# when the container is already gone.
_capture_session_state(identity, exit_code)
# Context exited → containers + networks gone. Now decide
# what to do with the per-bottle state dir on the host:
# capability-block apply sets the preserve marker before it
# tears the bottle down, so the operator can resume from the
# new Dockerfile + transcript snapshot. Any other session
# end (normal exit, agent crash, Ctrl-C) leaves no marker,
# and the state dir gets cleaned up so ~/.claude-bottle/state/
# doesn't accumulate per-launch debris.
# what to do with the per-bottle state dir on the host: any
# preserve marker (capability-block OR crash) keeps it; a
# clean exit cleans it up so ~/.claude-bottle/state/ doesn't
# accumulate per-launch debris.
_settle_state(identity)
return 0
finally:
shutil.rmtree(stage_dir, ignore_errors=True)
def _capture_session_state(identity: str, exit_code: int) -> None:
"""Inside the launch context, while the container is still
alive: snapshot the transcript and mark for preservation if
claude crashed. Pure-function-ish; tests stub the helpers."""
if not identity:
return
snapshot_transcript(identity)
if exit_code != 0:
mark_preserved(identity)
def _settle_state(identity: str) -> None:
if not identity:
return
+3 -3
View File
@@ -63,7 +63,7 @@ class TestApplyCapabilityChange(_FakeHomeMixin, unittest.TestCase):
# job is to sequence write + snapshot + push + teardown; we
# validate that sequence here, not the docker primitives.
self._calls: list[str] = []
self._orig_snapshot = capability_apply._snapshot_transcript
self._orig_snapshot = capability_apply.snapshot_transcript
self._orig_push = capability_apply._push_working_tree
self._orig_teardown = capability_apply._teardown_bottle
@@ -76,12 +76,12 @@ class TestApplyCapabilityChange(_FakeHomeMixin, unittest.TestCase):
def stub_teardown(slug):
self._calls.append(f"teardown:{slug}")
capability_apply._snapshot_transcript = stub_snapshot # type: ignore[assignment]
capability_apply.snapshot_transcript = stub_snapshot # type: ignore[assignment]
capability_apply._push_working_tree = stub_push # type: ignore[assignment]
capability_apply._teardown_bottle = stub_teardown # type: ignore[assignment]
def tearDown(self):
capability_apply._snapshot_transcript = self._orig_snapshot # type: ignore[assignment]
capability_apply.snapshot_transcript = self._orig_snapshot # type: ignore[assignment]
capability_apply._push_working_tree = self._orig_push # type: ignore[assignment]
capability_apply._teardown_bottle = self._orig_teardown # type: ignore[assignment]
self._teardown_fake_home()
+93
View File
@@ -0,0 +1,93 @@
"""Unit: cli/start.py session-end state capture (crash preservation).
The launch-context machinery is covered by integration; this isolates
the post-exec_claude decision: snapshot transcript + mark for
preservation if non-zero exit, no-op for clean exit."""
import tempfile
import unittest
from pathlib import Path
from claude_bottle import supervise
from claude_bottle.backend.docker import bottle_state
from claude_bottle.cli import start as start_mod
class _FakeHomeMixin:
def _setup_fake_home(self):
self._tmp = tempfile.TemporaryDirectory(prefix="cli-start-settle.")
self._original = supervise.claude_bottle_root
def fake_root() -> Path:
return Path(self._tmp.name) / ".claude-bottle"
supervise.claude_bottle_root = fake_root # type: ignore[assignment]
def _teardown_fake_home(self):
supervise.claude_bottle_root = self._original # type: ignore[assignment]
self._tmp.cleanup()
class TestCaptureSessionState(_FakeHomeMixin, unittest.TestCase):
def setUp(self):
self._setup_fake_home()
# Stub the docker-dependent snapshot call so this stays a
# unit test. apply_capability_change's integration test
# covers the real docker cp path.
self._snap_calls: list[str] = []
self._orig_snap = start_mod.snapshot_transcript
start_mod.snapshot_transcript = lambda identity: (
self._snap_calls.append(identity)
)
def tearDown(self):
start_mod.snapshot_transcript = self._orig_snap
self._teardown_fake_home()
def test_clean_exit_snapshots_but_does_not_mark(self):
start_mod._capture_session_state("dev-abc", exit_code=0)
self.assertEqual(["dev-abc"], self._snap_calls)
self.assertFalse(bottle_state.is_preserved("dev-abc"))
def test_crash_snapshots_and_marks(self):
start_mod._capture_session_state("dev-abc", exit_code=137)
self.assertEqual(["dev-abc"], self._snap_calls)
self.assertTrue(bottle_state.is_preserved("dev-abc"))
def test_ctrl_c_treated_as_crash(self):
# SIGINT delivers exit 130; the operator may have Ctrl-C'd
# because something went wrong, so we preserve.
start_mod._capture_session_state("dev-abc", exit_code=130)
self.assertTrue(bottle_state.is_preserved("dev-abc"))
def test_empty_identity_is_noop(self):
# Backends without an identity field shouldn't crash this
# path (the _identity_from_plan helper falls back to "").
start_mod._capture_session_state("", exit_code=137)
self.assertEqual([], self._snap_calls)
class TestSettleState(_FakeHomeMixin, unittest.TestCase):
def setUp(self):
self._setup_fake_home()
def tearDown(self):
self._teardown_fake_home()
def test_preserved_state_survives(self):
bottle_state.write_per_bottle_dockerfile("dev-abc", "FROM x\n")
bottle_state.mark_preserved("dev-abc")
start_mod._settle_state("dev-abc")
self.assertTrue(bottle_state.bottle_state_dir("dev-abc").is_dir())
def test_unpreserved_state_is_cleaned(self):
bottle_state.write_per_bottle_dockerfile("dev-abc", "FROM x\n")
start_mod._settle_state("dev-abc")
self.assertFalse(bottle_state.bottle_state_dir("dev-abc").exists())
def test_empty_identity_is_noop(self):
start_mod._settle_state("") # should not raise
if __name__ == "__main__":
unittest.main()