feat(state): preserve on crash + always snapshot transcript
Extends the preserve-on-capability-block design to also preserve
state on agent crash, and snapshots the transcript on every
teardown so any resume (crash or capability-block) gets a warm
claude session — not a cold start.
- capability_apply: rename _snapshot_transcript → snapshot_transcript
(public; reused below). No behavior change in the capability path.
- cli/start.py: capture bottle.exec_claude's exit code; while the
container is still alive (inside the launch context):
* always snapshot_transcript(identity)
* if exit_code != 0, mark_preserved(identity)
Then the existing _settle_state runs after teardown.
Now the preservation matrix is:
exit 0 (clean) → snapshot + cleanup state
exit ≠0 (crash, Ctrl-C) → snapshot + preserve + show resume hint
capability-block → (already snapshotted/preserved by apply
before teardown; this path is a no-op
because the container is already gone
by the time exec_claude returns)
snapshot_transcript is best-effort — capability-block's earlier
snapshot is not clobbered when the container is already torn down,
and a missing /home/node/.claude is a warn + skip.
Tested behavior: clean exit doesn't preserve, non-zero exit
(including SIGINT/130 and SIGKILL/137) preserves; empty identity
no-ops both helpers.
Co-Authored-By: Claude Opus 4.7 <noreply@anthropic.com>
This commit is contained in:
@@ -115,7 +115,7 @@ def apply_capability_change(slug: str, new_dockerfile: str) -> tuple[str, str]:
|
||||
raise CapabilityApplyError("proposed Dockerfile is empty")
|
||||
before = fetch_current_dockerfile(slug)
|
||||
|
||||
_snapshot_transcript(slug)
|
||||
snapshot_transcript(slug)
|
||||
_push_working_tree(slug)
|
||||
write_per_bottle_dockerfile(slug, new_dockerfile)
|
||||
# Set the preserve marker BEFORE teardown so cli.py's session-end
|
||||
@@ -139,12 +139,19 @@ def _repo_dockerfile_path() -> Path:
|
||||
return Path(__file__).resolve().parent.parent.parent.parent / "Dockerfile"
|
||||
|
||||
|
||||
def _snapshot_transcript(slug: str) -> None:
|
||||
def snapshot_transcript(slug: str) -> None:
|
||||
"""`docker cp` /home/node/.claude out of the agent container into
|
||||
~/.claude-bottle/state/<slug>/transcript/. Best-effort: missing
|
||||
container, missing dir, or cp error all log a warning and return.
|
||||
The transcript is what `claude --resume` reads to pick up where
|
||||
the agent left off."""
|
||||
the agent left off.
|
||||
|
||||
Called from two places:
|
||||
- capability-apply, before tearing the bottle down.
|
||||
- cli.py's session-end path, before the launch context closes,
|
||||
so a crash or normal exit also leaves a transcript on disk
|
||||
(deleted along with the state dir on clean exit, kept on
|
||||
crash or capability-block per the preserve marker)."""
|
||||
container = _agent_container_name(slug)
|
||||
dest = transcript_snapshot_dir(slug)
|
||||
if dest.exists():
|
||||
@@ -157,11 +164,11 @@ def _snapshot_transcript(slug: str) -> None:
|
||||
)
|
||||
if r.returncode != 0:
|
||||
warn(
|
||||
f"capability-apply: transcript snapshot skipped "
|
||||
f"transcript snapshot skipped "
|
||||
f"({(r.stderr or '').strip() or 'no transcript dir in container?'})"
|
||||
)
|
||||
return
|
||||
info(f"capability-apply: transcript snapshotted to {dest}")
|
||||
info(f"transcript snapshotted to {dest}")
|
||||
|
||||
|
||||
def _push_working_tree(slug: str) -> None:
|
||||
@@ -213,4 +220,5 @@ __all__ = [
|
||||
"CapabilityApplyError",
|
||||
"apply_capability_change",
|
||||
"fetch_current_dockerfile",
|
||||
"snapshot_transcript",
|
||||
]
|
||||
|
||||
+35
-10
@@ -17,7 +17,12 @@ import tempfile
|
||||
from pathlib import Path
|
||||
|
||||
from ..backend import BottleSpec, get_bottle_backend
|
||||
from ..backend.docker.bottle_state import cleanup_state, is_preserved
|
||||
from ..backend.docker.bottle_state import (
|
||||
cleanup_state,
|
||||
is_preserved,
|
||||
mark_preserved,
|
||||
)
|
||||
from ..backend.docker.capability_apply import snapshot_transcript
|
||||
from ..log import die, info
|
||||
from ..manifest import Manifest
|
||||
from ._common import PROG, USER_CWD, read_tty_line
|
||||
@@ -98,22 +103,42 @@ def _launch_bottle(
|
||||
claude_args = ["--dangerously-skip-permissions"]
|
||||
if remote_control:
|
||||
claude_args.append("--remote-control")
|
||||
bottle.exec_claude(claude_args, tty=True)
|
||||
info(f"session ended; container {bottle.name} will be removed")
|
||||
exit_code = bottle.exec_claude(claude_args, tty=True)
|
||||
info(
|
||||
f"session ended (exit {exit_code}); "
|
||||
f"container {bottle.name} will be removed"
|
||||
)
|
||||
# While the container is still alive: always snapshot the
|
||||
# transcript and — if the agent exited non-zero — mark
|
||||
# the state for preservation. Capability-block already
|
||||
# did both before triggering teardown from the dashboard;
|
||||
# this picks up crashes / Ctrl-Cs / OOM kills the same
|
||||
# way. snapshot_transcript is best-effort so the
|
||||
# capability-block path's prior snapshot isn't clobbered
|
||||
# when the container is already gone.
|
||||
_capture_session_state(identity, exit_code)
|
||||
# Context exited → containers + networks gone. Now decide
|
||||
# what to do with the per-bottle state dir on the host:
|
||||
# capability-block apply sets the preserve marker before it
|
||||
# tears the bottle down, so the operator can resume from the
|
||||
# new Dockerfile + transcript snapshot. Any other session
|
||||
# end (normal exit, agent crash, Ctrl-C) leaves no marker,
|
||||
# and the state dir gets cleaned up so ~/.claude-bottle/state/
|
||||
# doesn't accumulate per-launch debris.
|
||||
# what to do with the per-bottle state dir on the host: any
|
||||
# preserve marker (capability-block OR crash) keeps it; a
|
||||
# clean exit cleans it up so ~/.claude-bottle/state/ doesn't
|
||||
# accumulate per-launch debris.
|
||||
_settle_state(identity)
|
||||
return 0
|
||||
finally:
|
||||
shutil.rmtree(stage_dir, ignore_errors=True)
|
||||
|
||||
|
||||
def _capture_session_state(identity: str, exit_code: int) -> None:
|
||||
"""Inside the launch context, while the container is still
|
||||
alive: snapshot the transcript and mark for preservation if
|
||||
claude crashed. Pure-function-ish; tests stub the helpers."""
|
||||
if not identity:
|
||||
return
|
||||
snapshot_transcript(identity)
|
||||
if exit_code != 0:
|
||||
mark_preserved(identity)
|
||||
|
||||
|
||||
def _settle_state(identity: str) -> None:
|
||||
if not identity:
|
||||
return
|
||||
|
||||
Reference in New Issue
Block a user