feat(bottle): random-suffix identity + cli.py resume <identity>

Replaces the cwd-hash identity with a random 5-char base36 suffix per launch, so two simultaneous `start <agent>` invocations against the same cwd no longer collide on container names. Each launch is its own bottle.

State carries metadata: every prepare step writes ~/.claude-bottle/state/<identity>/metadata.json with the (agent_name, cwd, copy_cwd, started_at) the bottle was launched with. The new `cli.py resume <identity>` reads this metadata and re-launches a bottle pinned to the same identity — picking up the per-bottle Dockerfile (from a prior capability-block apply) and the transcript snapshot under the same state dir.

- bottle_state.py: bottle_identity(agent_name) drops the cwd param and gains a random suffix; BottleMetadata dataclass + read/write/metadata_path helpers.
- BottleSpec gains an optional identity field — resume sets it to pin the identity; start leaves it empty so prepare mints fresh.
- prepare.py: writes metadata at launch time; uses spec.identity if provided (resume) else bottle_identity(agent_name) (fresh start).
- start.py: extracted _launch_bottle from cmd_start so resume can share the launch core; prints `./cli.py resume <identity>` hint at session end.
- cli/resume.py (new): reads metadata, reconstructs BottleSpec with the recorded identity + cwd, delegates to _launch_bottle. Errors clearly when no state exists for the given identity.
- cli/__init__.py: registers `resume` in COMMANDS + usage.
- dashboard.py: capability-block approval status line now appends the `resume <identity>` hint so the operator can copy-paste the rebuild command without leaving the TUI.

Closes the rebuild loop in PRD 0016: agent calls capability-block → operator approves → bottle torn down with state preserved → status line shows resume command → operator runs it → replacement bottle boots with the new Dockerfile and prior transcript.

Co-Authored-By: Claude Opus 4.7 <noreply@anthropic.com>
2026-05-25 06:09:45 -04:00
parent e996f72532
commit 4032e04a9c
8 changed files with 311 additions and 76 deletions
@@ -53,6 +53,11 @@ class BottleSpec:
    agent_name: str
    copy_cwd: bool
    user_cwd: str
+    # PRD 0016 follow-up: when set, the backend's prepare step uses
+    # this identity instead of minting a fresh one — the resume path
+    # (`cli.py resume <identity>`) sets this to continue an existing
+    # bottle's state. Empty string for a fresh `start`.
+    identity: str = ""


@dataclass(frozen=True)
@@ -1,23 +1,39 @@
 """Per-bottle persistent state (PRD 0016).

 Holds the per-bottle Dockerfile override that capability-block
-remediation writes, plus the transcript snapshot the
-state-preservation helper saves before teardown. State lives at:
+remediation writes, the transcript snapshot the state-preservation
+helper saves before teardown, and the launch metadata that lets
+`cli.py resume <identity>` reconstruct a bottle's spec. State
+lives at:

-    ~/.claude-bottle/state/<slug>/
+    ~/.claude-bottle/state/<identity>/
+        metadata.json     — identity + agent_name + cwd + copy_cwd + started_at (for resume)
        Dockerfile        — per-bottle override (absent → use repo's)
        transcript/       — last snapshotted agent state (best-effort)

 When the per-bottle Dockerfile is present, the launch step builds
-the agent image with a per-bottle tag (claude-bottle-rebuilt-<slug>)
+the agent image with a per-bottle tag (claude-bottle-rebuilt-<id>)
 from this file rather than the repo's. The build context is still
 the repo root so the Dockerfile can COPY claude_bottle source files
 the same way the original does.
+
+Identity model:
+- Every `cli.py start <agent>` mints a fresh identity via
+  `bottle_identity(agent_name)`: slug-prefix for readability plus a
+  5-char random suffix for parallel-safe uniqueness. The metadata
+  written at launch time pins (agent_name, cwd) to that identity.
+- `cli.py resume <identity>` reads the metadata and re-launches a
+  bottle pinned to the same identity, picking up any per-bottle
+  Dockerfile and transcript snapshot.
 """

 from __future__ import annotations

-import hashlib
+import dataclasses
+import json
+import secrets
+import string
+from dataclasses import dataclass
 from pathlib import Path

 from ... import supervise as _supervise
@@ -28,33 +44,73 @@ from . import util as docker_mod
 _STATE_SUBDIR = "state"
 _PER_BOTTLE_DOCKERFILE_NAME = "Dockerfile"
 _TRANSCRIPT_SUBDIR = "transcript"
+_METADATA_NAME = "metadata.json"

-# How many hex chars of the cwd hash to fold into the identity. 12
-# hex chars = 48 bits of entropy; the cost of a collision is two
-# unrelated cwds sharing the same state — annoying but not security-
-# relevant. 12 keeps the identity short enough to stay readable in
-# container names and `ls` output.
-_CWD_HASH_LEN = 12
+# 5 chars of base36 alphabet ≈ 60M combinations. Plenty for human
+# operators starting bottles by hand; collision-free in practice.
+_RANDOM_SUFFIX_LEN = 5
+_SUFFIX_ALPHABET = string.ascii_lowercase + string.digits


-def bottle_identity(agent_name: str, cwd: Path | None) -> str:
-    """Stable, unique identifier for a bottle. Used as the key for
-    every persistent and runtime resource: container names, network
-    names, queue dir, audit log, per-bottle Dockerfile state.
+def bottle_identity(agent_name: str) -> str:
+    """Mint a fresh per-launch bottle identity. The slug-prefix is
+    `slugify(agent_name)` for readability; the suffix is 5 random
+    base36 chars so two simultaneous `start <agent>` invocations
+    don't collide on container/network names.

-    Without --cwd, the identity is just `slugify(agent_name)` — the
-    same value the codebase used to compute as `slug`. With --cwd
-    the identity is `slugify(agent_name)-<sha256(resolved-cwd)[:N]>`
-    so the same agent against different projects gets distinct
-    state. Same agent against the same cwd is stable across launches.
-
-    `cwd` should be the path the agent will see, *resolved* by the
-    caller, or None when no cwd was passed (no --cwd flag)."""
+    Every call produces a different identity (non-deterministic).
+    To continue an existing bottle's state, use the recorded
+    identity from BottleMetadata via `cli.py resume <identity>`,
+    not this function."""
    slug = docker_mod.slugify(agent_name)
-    if cwd is None:
-        return slug
-    h = hashlib.sha256(str(cwd).encode("utf-8")).hexdigest()
-    return f"{slug}-{h[:_CWD_HASH_LEN]}"
+    suffix = "".join(secrets.choice(_SUFFIX_ALPHABET) for _ in range(_RANDOM_SUFFIX_LEN))
+    return f"{slug}-{suffix}"
+
+
+@dataclass(frozen=True)
+class BottleMetadata:
+    """Persistent record of how a bottle was launched, written at
+    start time and read by `cli.py resume`. Lives at
+    ~/.claude-bottle/state/<identity>/metadata.json.
+
+    Serialized via `dataclasses.asdict` + JSON by `write_metadata`
+    and rebuilt field-by-field by `read_metadata`, so every field
+    must stay JSON-representable."""
+
+    # Key of the bottle's state directory; `metadata_path(identity)`
+    # locates this record on disk.
+    identity: str
+    # Agent name the bottle was launched with.
+    agent_name: str
+    cwd: str           # empty string when --cwd was not passed
+    # Whether the launch was made with the cwd copied in (--cwd).
+    copy_cwd: bool
+    started_at: str    # ISO 8601 UTC
+
+
+def metadata_path(identity: str) -> Path:
+    """Return the path of the metadata file inside the per-bottle
+    state directory for `identity`. Does not touch the filesystem
+    itself; the file may or may not exist."""
+    state_dir = bottle_state_dir(identity)
+    return state_dir / _METADATA_NAME
+
+
+def write_metadata(metadata: BottleMetadata) -> Path:
+    """Serialize `metadata` as indented JSON (with a trailing
+    newline) into ~/.claude-bottle/state/<identity>/metadata.json,
+    creating the state directory when needed, and return the path
+    written. The file is set to mode 0o644 — it carries no secrets,
+    just (agent_name, cwd, timestamp). An existing file for the same
+    identity is overwritten."""
+    target = metadata_path(metadata.identity)
+    target.parent.mkdir(parents=True, exist_ok=True)
+    payload = json.dumps(dataclasses.asdict(metadata), indent=2)
+    target.write_text(payload + "\n")
+    target.chmod(0o644)
+    return target
+
+
+def read_metadata(identity: str) -> BottleMetadata | None:
+    """Return the metadata for `identity`, or None if no usable
+    metadata has been recorded for it. Used by `cli.py resume` to
+    reconstruct the launch spec.
+
+    None covers a missing file, an unreadable file, malformed JSON,
+    and JSON whose top level is not an object, so callers get one
+    "no state" signal rather than a traceback from a corrupt file.
+    Keys absent from the file fall back to defaults: the requested
+    `identity`, empty strings, and copy_cwd=False."""
+    path = metadata_path(identity)
+    if not path.is_file():
+        return None
+    try:
+        raw = json.loads(path.read_text())
+    except (OSError, ValueError):
+        # json.JSONDecodeError and UnicodeDecodeError are both
+        # ValueError subclasses; OSError covers a file that vanished
+        # or became unreadable after the is_file() check. Treat all
+        # of them like the non-dict case below.
+        return None
+    if not isinstance(raw, dict):
+        return None
+    return BottleMetadata(
+        identity=str(raw.get("identity", identity)),
+        agent_name=str(raw.get("agent_name", "")),
+        cwd=str(raw.get("cwd", "")),
+        copy_cwd=bool(raw.get("copy_cwd", False)),
+        started_at=str(raw.get("started_at", "")),
+    )


 def bottle_state_dir(identity: str) -> Path:
@@ -100,11 +156,15 @@ def transcript_snapshot_dir(identity: str) -> Path:


 __all__ = [
+    "BottleMetadata",
    "bottle_identity",
    "bottle_state_dir",
+    "metadata_path",
    "per_bottle_dockerfile",
    "per_bottle_dockerfile_path",
    "per_bottle_image_tag",
+    "read_metadata",
    "transcript_snapshot_dir",
+    "write_metadata",
    "write_per_bottle_dockerfile",
 ]
@@ -11,6 +11,7 @@ via the base class's `prepare` template before this is called.
 from __future__ import annotations

 import os
+from datetime import datetime, timezone
 from pathlib import Path

 from ... import pipelock
@@ -27,10 +28,12 @@ from .cred_proxy import (
 )
 from .git_gate import DockerGitGate, git_gate_container_name
 from .bottle_state import (
+    BottleMetadata,
    bottle_identity,
    per_bottle_dockerfile,
    per_bottle_dockerfile_path,
    per_bottle_image_tag,
+    write_metadata,
 )
 from .pipelock import DockerPipelockProxy, pipelock_container_name
 from .supervise import DockerSupervise, supervise_container_name
@@ -54,16 +57,22 @@ def resolve_plan(
    agent = manifest.agents[spec.agent_name]
    bottle = manifest.bottle_for(spec.agent_name)

-    # PRD 0016 follow-up: identity, not bare slug. With --cwd, the
-    # identity carries a sha256(cwd) suffix so the same agent against
-    # different projects gets distinct container names, networks,
-    # queue + audit + state dirs. Without --cwd, identity ==
-    # slugify(agent_name) — same value the old `slug` produced — so
-    # no-cwd bottles look unchanged. We keep the variable named `slug`
-    # because every downstream module already threads it under that
-    # name; the value is now the bottle's full identity.
-    cwd_for_identity = Path(spec.user_cwd).resolve() if spec.copy_cwd else None
-    slug = bottle_identity(spec.agent_name, cwd_for_identity)
+    # PRD 0016 follow-up: identity, not bare slug. A fresh `start`
+    # mints a random-suffixed identity (so parallel runs of the same
+    # agent in the same cwd don't collide on container/network
+    # names); a `resume` passes the recorded identity in via
+    # spec.identity to continue an existing bottle's state.
+    slug = spec.identity or bottle_identity(spec.agent_name)
+    # Record the launch metadata so `cli.py resume <identity>` can
+    # reconstruct the spec. On resume this rewrites the same file:
+    # started_at is refreshed and the other fields are re-recorded
+    # from the resumed spec.
+    write_metadata(BottleMetadata(
+        identity=slug,
+        agent_name=spec.agent_name,
+        cwd=spec.user_cwd if spec.copy_cwd else "",
+        copy_cwd=spec.copy_cwd,
+        started_at=datetime.now(timezone.utc).isoformat(),
+    ))

    # PRD 0016 capability-block: if a per-bottle Dockerfile has been
    # written (via apply_capability_change), the base image becomes
@@ -1,6 +1,6 @@
 """Main CLI dispatcher.

-Commands: cleanup, dashboard, edit, info, init, list, start
+Commands: cleanup, dashboard, edit, info, init, list, resume, start
 """

 from __future__ import annotations
@@ -15,6 +15,7 @@ from .dashboard import cmd_dashboard
 from .edit import cmd_edit
 from .info import cmd_info
 from .init import cmd_init
+from .resume import cmd_resume
 from .start import cmd_start

 cmd_list = _list_mod.cmd_list
@@ -26,6 +27,7 @@ COMMANDS = {
    "info": cmd_info,
    "init": cmd_init,
    "list": cmd_list,
+    "resume": cmd_resume,
    "start": cmd_start,
 }

@@ -39,6 +41,7 @@ def usage() -> None:
    sys.stderr.write("  info      print env, skills, and prompt details for a named agent\n")
    sys.stderr.write("  init      interactively create a new agent and add it to claude-bottle.json\n")
    sys.stderr.write("  list      list available agents or active containers\n")
+    sys.stderr.write("  resume    re-launch a bottle by its identity (continues state from PRD 0016)\n")
    sys.stderr.write("  start     boot a container for a named agent and attach an interactive session\n\n")
    sys.stderr.write(f"Run '{PROG} <command> --help' for command-specific usage.\n")

@@ -112,6 +112,16 @@ def discover_pipelock_slugs() -> list[str]:
    return _discover_sidecar_slugs("claude-bottle-pipelock-")


+def _approval_status(qp: QueuedProposal, verb: str) -> str:
+    """Build the status-line text shown after a successful approval.
+
+    `verb` is the leading word(s) of the message (the callers pass
+    "approved" or "modified+approved"). For a capability-block
+    proposal the `resume <identity>` command is appended so the
+    operator can copy-paste it to bring the rebuilt bottle back up."""
+    proposal = qp.proposal
+    message = f"{verb} {proposal.tool} for [{proposal.bottle_slug}]"
+    if proposal.tool != TOOL_CAPABILITY_BLOCK:
+        return message
+    return f"{message}; resume: ./cli.py resume {proposal.bottle_slug}"
+
+
 def discover_pending() -> list[QueuedProposal]:
    """Walk ~/.claude-bottle/queue/* and collect pending proposals
    from every bottle's queue. Sorted by arrival time across the
@@ -371,7 +381,7 @@ def _main_loop(stdscr: "curses._CursesWindow") -> None:
        elif key == ord("a"):
            try:
                approve(qp)
-                status_line = f"approved {qp.proposal.tool} for [{qp.proposal.bottle_slug}]"
+                status_line = _approval_status(qp, "approved")
            except ApplyError as e:
                status_line = f"apply failed: {e}"
        elif key == ord("m"):
@@ -381,7 +391,7 @@ def _main_loop(stdscr: "curses._CursesWindow") -> None:
            else:
                try:
                    approve(qp, final_file=edited, notes="operator modified before approving")
-                    status_line = f"modified+approved {qp.proposal.tool} for [{qp.proposal.bottle_slug}]"
+                    status_line = _approval_status(qp, "modified+approved")
                except ApplyError as e:
                    status_line = f"apply failed: {e}"
        elif key == ord("r"):
@@ -0,0 +1,66 @@
+"""resume: re-launch a bottle by its identity.
+
+Reads ~/.claude-bottle/state/<identity>/metadata.json to recover the
+(agent_name, cwd, copy_cwd) the bottle was originally started with,
+then runs the same launch core as `start` — but pinned to the
+recorded identity so the new bottle picks up any per-bottle Dockerfile
+(from capability-block apply) and transcript snapshot under the same
+state dir.
+
+Use case: an agent calls capability-block, the dashboard approves
+and tears down the bottle, the operator runs
+    ./cli.py resume <identity>
+to bring up the replacement with the new capabilities baked in.
+"""
+
+from __future__ import annotations
+
+import argparse
+
+from ..backend import BottleSpec
+from ..backend.docker.bottle_state import read_metadata
+from ..log import die
+from ..manifest import Manifest
+from ._common import PROG, USER_CWD
+from .start import _launch_bottle
+
+
+def cmd_resume(argv: list[str]) -> int:
+    """Entry point for `resume <identity>`.
+
+    Parses `argv`, loads the metadata recorded for the identity,
+    checks the recorded agent against the manifest resolved from
+    the current working directory, and hands a BottleSpec pinned to
+    the recorded identity to the launch core shared with `start`.
+    Returns whatever `_launch_bottle` returns. When no metadata is
+    found, reports the problem through `die` (presumably does not
+    return — TODO confirm against log.die)."""
+    parser = argparse.ArgumentParser(prog=f"{PROG} resume", add_help=True)
+    for flag in ("--dry-run", "--remote-control"):
+        parser.add_argument(flag, action="store_true")
+    parser.add_argument(
+        "--format",
+        choices=("text", "json"),
+        default="text",
+        help="preflight output format; --format=json requires --dry-run",
+    )
+    parser.add_argument(
+        "identity",
+        help="bottle identity from a prior `start` (see its session-end output)",
+    )
+    opts = parser.parse_args(argv)
+
+    recorded = read_metadata(opts.identity)
+    if recorded is None:
+        die(
+            f"no state recorded for identity {opts.identity!r}; "
+            f"check ~/.claude-bottle/state/ or run `cli.py start` to create a new bottle"
+        )
+
+    manifest = Manifest.resolve(USER_CWD)
+    manifest.require_agent(recorded.agent_name)
+
+    # An empty recorded cwd means the original launch did not pass
+    # --cwd; fall back to the directory resume is run from.
+    launch_cwd = recorded.cwd or USER_CWD
+    resumed_spec = BottleSpec(
+        manifest=manifest,
+        agent_name=recorded.agent_name,
+        copy_cwd=recorded.copy_cwd,
+        user_cwd=launch_cwd,
+        identity=recorded.identity,
+    )
+    return _launch_bottle(
+        resumed_spec,
+        dry_run=opts.dry_run,
+        output_format=opts.format,
+        remote_control=opts.remote_control,
+    )
@@ -1,6 +1,10 @@
 """start: boot a sandboxed container for a named agent and attach an
 interactive claude-code session. The container is torn down when the
-session ends."""
+session ends.
+
+The launch core is shared with `cli.py resume <identity>`: see
+_launch_bottle below.
+"""

 from __future__ import annotations

@@ -43,18 +47,35 @@ def cmd_start(argv: list[str]) -> int:
        copy_cwd=args.cwd,
        user_cwd=USER_CWD,
    )
+    return _launch_bottle(
+        spec,
+        dry_run=dry_run,
+        output_format=args.format,
+        remote_control=args.remote_control,
+    )

+
+def _launch_bottle(
+    spec: BottleSpec,
+    *,
+    dry_run: bool,
+    output_format: str,
+    remote_control: bool,
+) -> int:
+    """Shared launch core for `start` and `resume`. Builds the plan,
+    prints / dry-runs / prompts as appropriate, brings the bottle up,
+    attaches claude, and prints the resume hint on session end."""
    stage_dir = Path(tempfile.mkdtemp(prefix="claude-bottle-stage."))
    try:
        backend = get_bottle_backend()
        plan = backend.prepare(spec, stage_dir=stage_dir)

-        if args.format == "json":
-            json.dump(plan.to_dict(remote_control=args.remote_control), sys.stdout, indent=2)
+        if output_format == "json":
+            json.dump(plan.to_dict(remote_control=remote_control), sys.stdout, indent=2)
            sys.stdout.write("\n")
            return 0

-        plan.print(remote_control=args.remote_control)
+        plan.print(remote_control=remote_control)

        if dry_run:
            info("dry-run requested; not starting container.")
@@ -67,16 +88,27 @@ def cmd_start(argv: list[str]) -> int:
            info("aborted by user")
            return 0

+        identity = _identity_from_plan(plan)
        with backend.launch(plan) as bottle:
            info(
                "attaching interactive claude session "
                "(Ctrl-D or 'exit' to leave; container will be removed)"
            )
            claude_args = ["--dangerously-skip-permissions"]
-            if args.remote_control:
+            if remote_control:
                claude_args.append("--remote-control")
            bottle.exec_claude(claude_args, tty=True)
            info(f"session ended; container {bottle.name} will be removed")
+            if identity:
+                info(f"to resume this bottle: ./cli.py resume {identity}")
            return 0
    finally:
        shutil.rmtree(stage_dir, ignore_errors=True)
+
+
+def _identity_from_plan(plan: object) -> str:
+    """Return the bottle identity carried by `plan`, or an empty
+    string when the plan has none.
+
+    The docker plan exposes its identity as `.slug`; the attribute
+    is looked up by name rather than by type so this layer stays
+    independent of any particular backend's plan class."""
+    no_identity = ""
+    return getattr(plan, "slug", no_identity)
@@ -1,4 +1,5 @@
-"""Unit: per-bottle state helpers (PRD 0016 Phase 1) + identity."""
+"""Unit: per-bottle state helpers (PRD 0016 Phase 1) + identity +
+launch metadata."""

 import re
 import tempfile
@@ -7,6 +8,11 @@ from pathlib import Path

 from claude_bottle import supervise
 from claude_bottle.backend.docker import bottle_state
+from claude_bottle.backend.docker.bottle_state import (
+    BottleMetadata,
+    read_metadata,
+    write_metadata,
+)


 class _FakeHomeMixin:
@@ -70,46 +76,90 @@ class TestPerBottleDockerfile(_FakeHomeMixin, unittest.TestCase):


 class TestBottleIdentity(unittest.TestCase):
-    """bottle_identity(agent_name, cwd) — PRD 0016 follow-up.
+    """bottle_identity(agent_name) — PRD 0016 follow-up.

-    Without --cwd, identity == slugify(agent_name) so existing
-    no-cwd bottles look unchanged. With --cwd, identity has a
-    cwd-hash suffix so the same agent against different projects
-    gets distinct container / queue / audit / state dirs."""
+    Every call mints a fresh identity with a random 5-char suffix
+    so multiple instances of the same agent can run in parallel
+    without container name collisions. The slug-prefix is for
+    readability; the suffix is for uniqueness. To continue an
+    existing bottle, use the recorded identity via
+    `cli.py resume <identity>`, not this function."""

-    def test_no_cwd_returns_slug(self):
-        self.assertEqual("dev", bottle_state.bottle_identity("dev", None))
-        self.assertEqual("api-foo", bottle_state.bottle_identity("Api Foo", None))
-
-    def test_cwd_appends_hash_suffix(self):
-        identity = bottle_state.bottle_identity("dev", Path("/proj/A"))
+    def test_format_is_slug_dash_5_alnum(self):
+        identity = bottle_state.bottle_identity("dev")
        self.assertTrue(identity.startswith("dev-"))
        suffix = identity[len("dev-"):]
-        self.assertEqual(12, len(suffix))
-        self.assertTrue(re.fullmatch(r"[0-9a-f]+", suffix), suffix)
+        self.assertEqual(5, len(suffix))
+        self.assertTrue(
+            re.fullmatch(r"[a-z0-9]+", suffix),
+            f"suffix {suffix!r} must be lowercase base36",
+        )

-    def test_same_cwd_same_identity(self):
-        a = bottle_state.bottle_identity("dev", Path("/proj/A"))
-        b = bottle_state.bottle_identity("dev", Path("/proj/A"))
-        self.assertEqual(a, b)
-
-    def test_different_cwds_differ(self):
-        a = bottle_state.bottle_identity("dev", Path("/proj/A"))
-        b = bottle_state.bottle_identity("dev", Path("/proj/B"))
+    def test_two_calls_yield_different_identities(self):
+        # 5-char base36 gives ~60M combinations; collision in two
+        # calls is astronomically unlikely. If this ever flakes it's
+        # almost certainly a regression, not a bad-luck collision.
+        a = bottle_state.bottle_identity("dev")
+        b = bottle_state.bottle_identity("dev")
        self.assertNotEqual(a, b)

-    def test_different_agents_same_cwd_differ(self):
-        a = bottle_state.bottle_identity("dev", Path("/proj/A"))
-        b = bottle_state.bottle_identity("api", Path("/proj/A"))
-        self.assertNotEqual(a, b)
+    def test_different_agents_get_different_prefixes(self):
+        a = bottle_state.bottle_identity("dev")
+        b = bottle_state.bottle_identity("api")
+        self.assertTrue(a.startswith("dev-"))
+        self.assertTrue(b.startswith("api-"))

    def test_agent_name_slugified(self):
-        # Identity's agent-name prefix is slugify(name), not the raw
-        # name — same rule the rest of the codebase has always used.
-        self.assertEqual(
-            "my-agent",
-            bottle_state.bottle_identity("My Agent", None),
+        identity = bottle_state.bottle_identity("My Agent")
+        self.assertTrue(identity.startswith("my-agent-"))
+
+
+class TestBottleMetadata(_FakeHomeMixin, unittest.TestCase):
+    """write_metadata / read_metadata behavior. Each test runs
+    between _setup_fake_home() and _teardown_fake_home() from
+    _FakeHomeMixin (presumably redirecting the home directory so
+    nothing touches the real ~/.claude-bottle — confirm in the
+    mixin)."""
+
+    def setUp(self):
+        self._setup_fake_home()
+
+    def tearDown(self):
+        self._teardown_fake_home()
+
+    def test_read_missing_returns_none(self):
+        # No metadata file was ever written for this identity.
+        self.assertIsNone(read_metadata("does-not-exist"))
+
+    def test_write_then_read_roundtrip(self):
+        # Every field must survive the JSON round-trip unchanged.
+        meta = BottleMetadata(
+            identity="dev-a4f8c",
+            agent_name="dev",
+            cwd="/proj/A",
+            copy_cwd=True,
+            started_at="2026-05-25T12:00:00+00:00",
        )
+        write_metadata(meta)
+        loaded = read_metadata("dev-a4f8c")
+        self.assertEqual(meta, loaded)
+
+    def test_metadata_lives_under_state_dir(self):
+        # write_metadata returns the path it wrote; only the tail is
+        # checked because the home prefix differs per test run.
+        meta = BottleMetadata(
+            identity="dev-x", agent_name="dev",
+            cwd="", copy_cwd=False, started_at="t",
+        )
+        path = write_metadata(meta)
+        self.assertTrue(
+            str(path).endswith("/.claude-bottle/state/dev-x/metadata.json"),
+        )
+
+    def test_overwriting_metadata_updates_timestamp(self):
+        # `resume` re-writes metadata with a fresh started_at;
+        # everything else stays the same.
+        write_metadata(BottleMetadata(
+            identity="dev-y", agent_name="dev",
+            cwd="/proj/A", copy_cwd=True, started_at="t1",
+        ))
+        write_metadata(BottleMetadata(
+            identity="dev-y", agent_name="dev",
+            cwd="/proj/A", copy_cwd=True, started_at="t2",
+        ))
+        loaded = read_metadata("dev-y")
+        # Guard the Optional return before the attribute access below.
+        assert loaded is not None
+        self.assertEqual("t2", loaded.started_at)


 if __name__ == "__main__":