2026-06-02 02:31:37 -04:00 · 2026-06-02 02:27:04 -04:00
6 changed files with 548 additions and 193 deletions
@@ -21,7 +21,6 @@ from __future__ import annotations
 import dataclasses
 import os
 import time
 from contextlib import ExitStack, contextmanager
 from pathlib import Path
 from typing import Callable, Generator
@@ -94,25 +93,60 @@ def launch(
    via the ExitStack."""
    stack = ExitStack()
    try:
-        # 1. Reserve a loopback alias for this bottle. macOS only
+        loopback_ip, network = _allocate_resources(plan, stack)
-        # routes 127.0.0.1 by default; the per-bottle alias is
+        plan = _mint_certs(plan)
-        # what bundles the docker port-publishes and TSI allowlist
+        plan = _start_bundle(plan, network, loopback_ip, stack)
-        # against, so this bottle can't reach other bottles' (or
+        plan = _discover_urls(plan, loopback_ip)
-        # other host services') ports on the loopback. Lazy
+
-        # sudo-driven on first use per boot. No-op on Linux.
+        # Build the agent image and pack it into a `.smolmachine`
        # artifact (or hit the per-Dockerfile-digest cache). Runs
        # here, not in prepare, so the docker-build output doesn't
        # garble the dashboard's preflight modal.
        agent_from_path = _ensure_smolmachine(
            plan.agent_image_ref,
            dockerfile=plan.agent_dockerfile_path,
        )
        _launch_vm(plan, agent_from_path, loopback_ip, stack)
        _init_vm(plan)
        prompt_path = provision(plan, plan.machine_name)
        yield SmolmachinesBottle(
            plan.machine_name,
            prompt_path=prompt_path,
            guest_env=plan.guest_env,
            agent_command=plan.agent_command,
            agent_prompt_mode=plan.agent_prompt_mode,
        )
    finally:
        stack.close()
 def _allocate_resources(
    plan: SmolmachinesBottlePlan,
    stack: ExitStack,
 ) -> tuple[str, str]:
    """Reserve this bottle's loopback alias and create its docker bridge.

    macOS only routes 127.0.0.1 by default; the per-bottle alias
    scopes TSI's allowlist to this bottle's published ports so the
    agent can't reach other bottles' or host services' ports on
    loopback. No-op on Linux.

    Returns `(loopback_ip, network_name)`. Only the bridge network
    gets a teardown callback on `stack`; nothing is registered for
    the alias."""
    _loopback.ensure_pool()
    alias_ip = _loopback.allocate(plan.slug)

    # Per-bottle docker bridge; removal is registered right after
    # creation so the stack unwinds it on any later failure.
    bridge_name = _bundle.bundle_network_name(plan.slug)
    _bundle.create_bundle_network(
        bridge_name,
        plan.bundle_subnet,
        plan.bundle_gateway,
    )
    stack.callback(_bundle.remove_bundle_network, bridge_name)

    return alias_ip, bridge_name
-        # 2. Mint per-bottle CAs and update the inner Plans with
+
-        # their launch-time paths. pipelock always runs in the
+def _mint_certs(plan: SmolmachinesBottlePlan) -> SmolmachinesBottlePlan:
-        # bundle; egress's CA is only minted when the bottle
+    """Mint per-bottle CAs and return the plan with CA paths filled.
-        # declares routes (otherwise egress runs idle without
+
-        # MITM and the CA files would be unused).
+    Pipelock always runs in the bundle. Egress's CA is only minted
    when the bottle declares routes — otherwise egress runs idle
    without MITM and the CA files would be unused."""
    ca_cert_host, ca_key_host = pipelock_tls_init(plan.proxy_plan.yaml_path.parent)
    proxy_plan = dataclasses.replace(
        plan.proxy_plan,
@@ -129,41 +163,47 @@ def launch(
            mitmproxy_ca_host_path=egress_ca_host,
            mitmproxy_ca_cert_only_host_path=egress_ca_cert_only,
            pipelock_ca_host_path=ca_cert_host,
-                # On smolmachines, egress's upstream is pipelock
+            # On smolmachines, egress's upstream is pipelock on the
-                # on the bundle's localhost — they're in the same
+            # bundle's localhost — they're in the same container's
-                # container's network namespace.
+            # network namespace.
            pipelock_proxy_url=BUNDLE_LOCAL_PIPELOCK_URL,
        )
-        plan = dataclasses.replace(
+    return dataclasses.replace(plan, proxy_plan=proxy_plan, egress_plan=egress_plan)
            plan, proxy_plan=proxy_plan, egress_plan=egress_plan,
        )
-        # 3. Build the BundleLaunchSpec from the (now-resolved)
+
-        # inner Plans: daemon subset, env, bind-mounts, and the
+def _start_bundle(
-        # loopback alias to bind published ports against. The
+    plan: SmolmachinesBottlePlan,
-        # spec's ports_to_publish list expands depending on which
+    network: str,
-        # daemons the agent needs to reach from the smolvm guest.
+    loopback_ip: str,
    stack: ExitStack,
 ) -> SmolmachinesBottlePlan:
    """Build the BundleLaunchSpec, resolve token env, start the
    sidecar bundle container, and register teardown."""
    bundle_spec = _bundle_launch_spec(plan, network, loopback_ip)
    token_env = _resolve_token_env(plan, dict(os.environ))
    _bundle.ensure_bundle_image(bundle_spec.image)
    _bundle.start_bundle(bundle_spec, env={**os.environ, **token_env})
    stack.callback(_bundle.stop_bundle, plan.slug)
    return plan
-        # 4. Discover the host-side ports docker assigned for the
+
-        # bundle's published container ports, and bind the
+def _discover_urls(
-        # agent's URLs to `<loopback_ip>:<host port>`. Docker
+    plan: SmolmachinesBottlePlan,
-        # container IPs (192.168.x.x in the daemon's bridge)
+    loopback_ip: str,
-        # aren't reachable from the smolvm guest on macOS — TSI
+) -> SmolmachinesBottlePlan:
-        # uses macOS networking, and macOS sees the daemon's
+    """Discover host-side ports for published container ports and
-        # bridge via the published-port loopback forward only.
+    return the plan with URLs + guest_env stamped in.
-        #
+
-        # Proxy hop order matches the docker backend: when the
+    Docker container IPs (192.168.x.x in the daemon's bridge)
-        # bottle declares egress routes, the agent's first hop is
+    aren't reachable from the smolvm guest on macOS — TSI uses
-        # egress (for token injection), then pipelock. Without
+    macOS networking, and macOS sees the daemon's bridge via the
-        # routes, the agent dials pipelock directly. Whichever
+    published-port loopback forward only.
-        # one is "agent-facing" is the daemon whose port we
+
-        # publish on host loopback; the other stays bundle-
+    Proxy hop order: when the bottle declares egress routes, the
-        # internal as the upstream proxy.
+    agent's first hop is egress (for token injection), then
    pipelock. Without routes, the agent dials pipelock directly.
    NO_PROXY includes the per-bottle loopback alias so the
    supervise + git-gate URLs bypass HTTPS_PROXY."""
    if plan.egress_plan.routes:
        agent_facing_port = _EGRESS_PORT
    else:
@@ -172,12 +212,14 @@ def launch(
        plan.slug, agent_facing_port, host_ip=loopback_ip,
    )
    agent_proxy_url = f"http://{loopback_ip}:{agent_facing_host_port}"
    agent_git_gate_host = ""
    if plan.git_gate_plan.upstreams:
        git_gate_host_port = _bundle.bundle_host_port(
            plan.slug, _GIT_HTTP_PORT, host_ip=loopback_ip,
        )
        agent_git_gate_host = f"{loopback_ip}:{git_gate_host_port}"
    agent_supervise_url = ""
    if plan.supervise_plan is not None:
        supervise_host_port = _bundle.bundle_host_port(
@@ -185,20 +227,6 @@ def launch(
        )
        agent_supervise_url = f"http://{loopback_ip}:{supervise_host_port}/"
        # Stamp the URLs onto the plan + guest_env. provision_git
        # and provision_supervise read the plan fields; the agent
        # reads guest_env on every exec_agent.
        #
        # NO_PROXY has to include the per-bottle loopback alias —
        # otherwise claude's HTTPS_PROXY catches direct calls to
        # the supervise URL (`http://<alias>:<port>/`) and proxies
        # them through egress, which has no route for the alias
        # and rejects with "Failed to connect". The smolmachines
        # git-gate URL uses smart HTTP, so it also has to bypass
        # the agent's HTTP_PROXY and go straight to the host-
        # published git HTTP endpoint. Append rather than overwrite
        # so prepare.py's
        # `localhost,127.0.0.1` baseline stays in place.
    existing_no_proxy = plan.guest_env.get("NO_PROXY", "localhost,127.0.0.1")
    guest_env = {
        **plan.guest_env,
@@ -210,7 +238,8 @@ def launch(
        guest_env["GIT_GATE_URL"] = f"http://{agent_git_gate_host}"
    if agent_supervise_url:
        guest_env["MCP_SUPERVISE_URL"] = agent_supervise_url
-        plan = dataclasses.replace(
+
    return dataclasses.replace(
        plan,
        guest_env=guest_env,
        agent_proxy_url=agent_proxy_url,
@@ -218,25 +247,20 @@ def launch(
        agent_supervise_url=agent_supervise_url,
    )
        # 5. Build the agent image and pack it into a
        # `.smolmachine` artifact (or hit the per-Dockerfile-digest
        # cache). Runs here, not in prepare, so the docker-build
        # output doesn't garble the dashboard's preflight modal:
        # both the curses-endwin path and the tmux pane-routing
        # path redirect stderr around `launch` already.
        agent_from_path = _ensure_smolmachine(
            plan.agent_image_ref,
            dockerfile=plan.agent_dockerfile_path,
        )
-        # smolvm VM. --from carries the pre-packed .smolmachine
+def _launch_vm(
-        # artifact; --allow-cidr + -e carry the per-bottle TSI
+    plan: SmolmachinesBottlePlan,
-        # allowlist + env. The allowlist is the per-bottle
+    agent_from_path: Path,
-        # loopback alias — narrowing it to one /32 keeps the
+    loopback_ip: str,
-        # agent from reaching other host loopback services or
+    stack: ExitStack,
-        # other bottles' published ports. Smolfile isn't usable
+) -> None:
-        # here — smolvm 0.8.0 makes `--from` and `--smolfile`
+    """Create, patch, and start the smolvm VM; register teardown.
-        # mutually exclusive.
+
    --allow-cidr is the per-bottle loopback alias so the guest can
    only reach this bottle's bundle ports. force_allowlist patches
    smolvm 0.8.0's silent-drop of --allow-cidr when combined with
    --from. Smolfile isn't usable here — smolvm 0.8.0 makes --from
    and --smolfile mutually exclusive."""
    _smolvm.machine_create(
        plan.machine_name,
        from_path=agent_from_path,
@@ -244,61 +268,33 @@ def launch(
        env=plan.guest_env,
    )
    stack.callback(_smolvm.machine_delete, plan.machine_name)
-        # Workaround smolvm 0.8.0: `--allow-cidr` is silently
+    # Workaround smolvm 0.8.0: `--allow-cidr` is silently dropped
-        # dropped when combined with `--from`. Patch the persisted
+    # when combined with `--from`. Patch the persisted state DB
-        # state DB to set the allowlist before start so the booted
+    # before start so the booted VM's TSI actually enforces.
        # VM's TSI actually enforces. See loopback_alias's module
        # docstring for the investigation that led here.
    _loopback.force_allowlist(plan.machine_name, [f"{loopback_ip}/32"])
    _smolvm.machine_start(plan.machine_name)
    stack.callback(_smolvm.machine_stop, plan.machine_name)
-        # 6. Repair filesystem ownership + perms that smolvm's
+
-        # pack process remapped to the host invoker's uid (501
+def _init_vm(plan: SmolmachinesBottlePlan) -> None:
-        # on macOS) rather than preserving the image's expected
+    """Repair filesystem ownership and wait for exec channel readiness.
-        # ownership.
+
-        #
+    Ownership repair: smolvm's pack process remaps files to the host
-        #  - /home/node → node:node so the node user can write
+    invoker's uid (501 on macOS). /home/node must be node:node so
-        #    its own dotfiles (claude appendFileSync on
+    Claude Code can write ~/.claude.json; /tmp + /var/tmp need root
-        #    ~/.claude.json otherwise bails with ENOENT/EPERM
+    mode 1777 so non-root processes can create per-uid scratch dirs.
-        #    and the TUI hangs without surfacing the error).
+    All folded into one sh -c to avoid back-to-back exec calls
-        #  - /tmp + /var/tmp → root:root mode 1777 so non-root
+    immediately after machine_start (libkrun exec-channel race).
-        #    processes can create their per-uid scratch dirs
+
-        #    (claude-code creates /tmp/claude-<uid>/ as soon as
+    wait_exec_ready polls until the exec channel is ready for the
-        #    it spawns a Bash tool call).
+    subsequent provision calls, replacing the empirical sleep."""
        #
        # All folded into one sh -c so we only pay one
        # machine_exec round trip — back-to-back exec calls
        # right after machine_start hit a SIGKILL race in
        # libkrun's exec channel (see provision_ca for the
        # other half of this same workaround).
    _smolvm.machine_exec(plan.machine_name, [
        "sh", "-c",
        "chown -R node:node /home/node && "
        "chown root:root /tmp /var/tmp && "
        "chmod 1777 /tmp /var/tmp",
    ])
-
+    _smolvm.wait_exec_ready(plan.machine_name)
        # Wait briefly for the VM to settle. Back-to-back smolvm
        # machine_exec calls immediately after machine_start
        # occasionally SIGKILL the in-VM child at ~100ms (looks
        # like a VM warm-up race in libkrun's exec channel).
        # 1.5s is empirically enough to dodge it; provisioning
        # already takes seconds so the wait is amortized.
        time.sleep(1.5)
        # 7. Provision (CA / prompt / skills / git / supervise).
        prompt_path = provision(plan, plan.machine_name)
        yield SmolmachinesBottle(
            plan.machine_name,
            prompt_path=prompt_path,
            guest_env=plan.guest_env,
            agent_command=plan.agent_command,
            agent_prompt_mode=plan.agent_prompt_mode,
        )
    finally:
        stack.close()
 def _bundle_launch_spec(
@@ -324,10 +320,9 @@ def _bundle_launch_spec(
    # is "agent-facing" gets its port published on the host
    # loopback (see `_ensure_smolmachine`'s discovery loop) and the
    # other stays bundle-internal. The bundle is NOT reachable by
-    # bridge IP from the smolvm guest, so the
+    # bridge IP from the smolvm guest on macOS — TSI uses macOS
-    # PRD-0023-chunk-3 EGRESS_LISTEN_HOST=127.0.0.1 mitigation
+    # networking, and macOS sees the daemon's bridge via the
-    # isn't needed: the agent can only dial whatever daemon's
+    # published-port loopback forward only.
    # host port we publish, period.
    # --- pipelock ---------------------------------------------
    pp = plan.proxy_plan
@@ -45,6 +45,7 @@ alias gets handed to a new bottle."""
 from __future__ import annotations
 import fcntl
 import json
 import os
 import platform
@@ -83,6 +84,14 @@ _POOL_START = 16
 _POOL_END = 31  # inclusive
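 # File lock that serialises concurrent allocate() calls so two
 # simultaneous launches don't read docker state at the same instant.
 # Narrowed to the allocate() call itself; docker run runs after the
 # lock is released. Once the container is running it appears in
 # docker state and future allocate() calls will see it.
 # NOTE(review): the lock does not reserve the returned alias — a
 # second allocate() that runs after the lock is released but before
 # the first launch's bundle container starts still sees that alias
 # as free.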
 _ALLOC_LOCK_PATH = Path.home() / ".cache" / "bot-bottle" / "smolmachines.lock"
 # Loopback aliases pool: 127.0.0.<start>..127.0.0.<end>.
 def _pool_addresses() -> list[str]:
    """Return every alias in the pool in ascending order, from
    127.0.0.<_POOL_START> through 127.0.0.<_POOL_END> inclusive."""
    last_octets = range(_POOL_START, _POOL_END + 1)
    addresses = []
    for octet in last_octets:
        addresses.append(f"127.0.0.{octet}")
    return addresses
@@ -179,9 +188,20 @@ def allocate(slug: str) -> str:
    On non-macOS the whole `127.0.0.0/8` is loopback by default;
    `127.0.0.1` is fine to share and we skip the alias dance.
    This still returns a deterministic address so launch.py's
-    callers don't have to branch on platform."""
+    callers don't have to branch on platform.
    An exclusive file lock serialises concurrent calls so two
    simultaneous launches don't read the same docker state and
    claim the same alias."""
    if not _is_macos():
        return "127.0.0.1"
    _ALLOC_LOCK_PATH.parent.mkdir(parents=True, exist_ok=True)
    with open(_ALLOC_LOCK_PATH, "w") as lf:
        fcntl.flock(lf, fcntl.LOCK_EX)
        return _allocate_locked()
 def _allocate_locked() -> str:
    in_use = _aliases_in_use()
    for ip in _pool_addresses():
        if ip not in in_use:
@@ -27,11 +27,13 @@ from __future__ import annotations
 import shutil
 import subprocess
 import time
 from dataclasses import dataclass
 from pathlib import Path
 from typing import Mapping, Sequence
 _SMOLVM = "smolvm"
@@ -197,6 +199,34 @@ def machine_exec(
    )
 def wait_exec_ready(name: str, *, timeout: float = 5.0) -> None:
    """Poll `machine exec true` until exit 0 or `timeout` elapses.

    Replaces `time.sleep(1.5)` after `machine_start`: libkrun's exec
    channel needs a brief warm-up before back-to-back exec calls are
    safe. Polling exits as soon as the channel is ready and fails
    loudly if the VM never responds.

    `name` is the machine to probe; `timeout` is the total budget in
    seconds. Probes are separated by a backoff that starts at 0.1s,
    doubles after each failure, and is capped at 0.5s (and at the
    time remaining). A non-positive `timeout` performs no probe and
    raises immediately.

    Raises `SmolvmError` when no probe returned 0 before the
    deadline."""
    deadline = time.monotonic() + timeout
    delay = 0.1
    while time.monotonic() < deadline:
        r = machine_exec(name, ["true"])
        if r.returncode == 0:
            return
        # Re-read the clock after the probe: the exec itself may have
        # consumed the rest of the budget, in which case don't sleep.
        remaining = deadline - time.monotonic()
        if remaining <= 0:
            break
        time.sleep(min(delay, remaining))
        delay = min(delay * 2, 0.5)
    # argv and the CompletedProcess below are assembled only to give
    # SmolvmError something to report; no command is run here.
    argv = ["smolvm", "machine", "exec", "--name", name, "--", "true"]
    raise SmolvmError(
        argv,
        subprocess.CompletedProcess(
            args=argv, returncode=-1, stdout="",
            # `:g` keeps fractional timeouts readable ("0.5s"); the
            # previous `:.0f` reported them as "0s".
            stderr=f"exec channel not ready after {timeout:g}s — VM may have failed to boot.",
        ),
    )
 def machine_cp(src: str, dst: str) -> None:
    """`smolvm machine cp SRC DST`. Path syntax: `machine:path` to
    reference a path inside the VM, bare path for the host. Both
@@ -0,0 +1,221 @@
 # PRD 0032: Decompose smolmachines launch and harden bringup sequencing
 - **Status:** Active
 - **Author:** didericis-claude
 - **Created:** 2026-06-02
 - **Issue:** #122
 ## Summary
 Split `launch()` into named per-step helpers, replace the empirical
 `time.sleep(1.5)` with a readiness poll, and file-lock loopback alias
 allocation. Addresses the three actionable issues from the #117 hotspot
 review of `smolmachines/launch.py`.
 ## Problem
 ### 1. `launch()` step ordering
 `launch()` in `smolmachines/launch.py` is 207 lines. Seven sequenced
 steps are marked by numbered inline comments (`# 1. Reserve a loopback
 alias`, `# 2. Mint per-bottle CAs`, ...) — the sequencing is
 load-bearing (CA paths must be filled before the bundle spec is built;
 the bundle must be running before port discovery; the VM must be created
 before the allowlist is patched), but the dependencies are enforced only
 by linear ordering within one function. Adding a new daemon, changing
 the port-forward strategy, or debugging a bringup failure requires
 reading the whole function to understand what state each step produces.
 Each step is also not individually testable without mocking the entire
 surrounding context.
 ### 2. `time.sleep(1.5)` for libkrun exec-channel race
 After `machine_start`, back-to-back `machine_exec` calls occasionally
 hit a SIGKILL in libkrun's exec channel at ~100ms. The sleep is
 documented as "1.5s is empirically enough; provisioning already takes
 seconds so the wait is amortized." The failure mode if the sleep is
 insufficient: the filesystem-repair exec (`chown -R node:node /home/node`)
 is SIGKILLed silently, and the agent later bails with `ENOENT`/`EPERM`
 when Claude Code tries to write to `~/.claude.json`. A poll-until-ready
 loop is more robust than a fixed duration: it exits as soon as the exec
 channel is up, fails loudly with a timeout if the VM never becomes
 responsive, and is self-documenting about what it is waiting for.
 ### 3. Loopback alias allocation is not concurrent-safe
 `loopback_alias.allocate()` reads docker container state to determine
 which aliases are already in use, then returns the lowest free alias.
 There is no lock between that read and the bundle's `docker run` (which
 creates the container that will appear in future `docker ps` output). Two
 simultaneous bottle launches can both see the same alias as free and
 claim it, causing both bundles to bind on the same loopback IP. On macOS,
 where users occasionally start multiple agents in quick succession, this
 is a realistic failure mode.
 ## Non-goals
 - Removing `force_allowlist` / the `--allow-cidr` DB patch. That is a
  workaround for a smolvm 0.8.0 bug; removal is a one-liner when smolvm
  honors the CLI flag upstream.
 - Changing the ephemeral registry / crane detour in `local_registry.py`.
  Required by Docker Desktop's network topology.
 - Changing `_ensure_smolmachine`'s cache design. Cache invalidation by
  docker image ID works; issue #111 tracks a separate stale-sidecar
  concern.
 ## Design
 ### 1. Decompose `launch()` into named helpers
 Extract six focused helpers. `launch()` becomes a coordinator that calls
 them in order, passing the `ExitStack` for teardown registration:
 ```
 _allocate_resources(plan, stack) → (loopback_ip, network)
 ```
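 Reserve the loopback alias, create the docker bridge network, and
 register the network's teardown callback. (The alias has no callback of
 its own: `allocate()` derives in-use aliases from docker state.)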
 ```
 _mint_certs(plan) → plan
 ```
 Pipelock TLS init (always). Egress TLS init when `plan.egress_plan.routes`
 is non-empty. Returns the plan with CA paths filled via
 `dataclasses.replace`.
 ```
 _start_bundle(plan, network, loopback_ip, stack) → plan
 ```
 Build the `BundleLaunchSpec`, resolve token env, start the bundle
 container, register teardown. Returns the plan unchanged: no plan field
 carries `bundle_spec`, which is built and consumed inside this helper.
 ```
 _discover_urls(plan, loopback_ip) → plan
 ```
 Look up host-side ports for the published container ports; assemble
 `agent_proxy_url`, `agent_git_gate_host`, `agent_supervise_url`; stamp
 them onto the plan and into `guest_env`.
 ```
 _launch_vm(plan, agent_from_path, loopback_ip, stack) → None
 ```
 `machine_create` + `force_allowlist` + `machine_start`. Register
 `machine_stop` and `machine_delete` teardown callbacks on the stack.
 ```
 _init_vm(plan) → None
 ```
 Filesystem-repair exec (`chown`/`chmod`) followed by
 `wait_exec_ready()` (defined in `smolvm.py`).
 `launch()` reduces to:
 ```python
 loopback_ip, network = _allocate_resources(plan, stack)
 plan = _mint_certs(plan)
 plan = _start_bundle(plan, network, loopback_ip, stack)
 plan = _discover_urls(plan, loopback_ip)
 agent_from_path = _ensure_smolmachine(plan.agent_image_ref,
                                      dockerfile=plan.agent_dockerfile_path)
 _launch_vm(plan, agent_from_path, loopback_ip, stack)
 _init_vm(plan)
 prompt_path = provision(plan, plan.machine_name)
 yield SmolmachinesBottle(...)
 ```
 Each helper's inputs and outputs are explicit; each is independently
 testable with a minimal set of mocks.
 ### 2. Replace `time.sleep(1.5)` with `wait_exec_ready`
 Add to `smolvm.py`:
 ```python
 def wait_exec_ready(name: str, *, timeout: float = 5.0) -> None:
    """Poll until `machine exec true` exits 0 or `timeout` elapses.
    Replaces a fixed sleep after machine_start for the libkrun
    exec-channel warm-up race."""
    deadline = time.monotonic() + timeout
    delay = 0.1
    while time.monotonic() < deadline:
        r = machine_exec(name, ["true"])
        if r.returncode == 0:
            return
        remaining = deadline - time.monotonic()
        if remaining <= 0:
            break
        time.sleep(min(delay, remaining))
        delay = min(delay * 2, 0.5)
    die(
        f"smolvm machine {name!r}: exec channel not ready after "
        f"{timeout:.0f}s — VM may have failed to boot."
    )
 ```
 `_init_vm` calls `wait_exec_ready` after the chown/chmod exec instead of
 `time.sleep(1.5)`. The `time` import in `launch.py` is removed.
 ### 3. File-lock loopback alias allocation
 Add to `loopback_alias.py`:
 ```python
 import fcntl
 _ALLOC_LOCK_PATH = Path.home() / ".cache" / "bot-bottle" / "smolmachines.lock"
 def allocate(slug: str) -> str:
    if not _is_macos():
        return "127.0.0.1"
    _ALLOC_LOCK_PATH.parent.mkdir(parents=True, exist_ok=True)
    with open(_ALLOC_LOCK_PATH, "w") as lf:
        fcntl.flock(lf, fcntl.LOCK_EX)
        return _allocate_locked(slug)
 def _allocate_locked(slug: str) -> str:
    in_use = _aliases_in_use()
    for ip in _pool_addresses():
        if ip not in in_use:
            return ip
    die(...)
    return ""
 ```
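 The lock is held only for the duration of `_aliases_in_use()` + the
 `allocate` return. The bundle's `docker run` runs after the lock is
 released. Once `docker run` returns, the container is visible in docker
 state and future `allocate()` calls will see it.
 Caveat: the lock serialises the docker-state read but does not reserve
 the alias it returns. The remaining window (lock released → container
 appears in docker state) still covers everything `launch()` does between
 `allocate()` and `start_bundle()` — network creation, cert minting, and
 `ensure_bundle_image` — so a second launch that calls `allocate()` inside
 that window reads the same state and can be handed the same alias.
 Closing the window requires either holding the lock until the bundle
 container is running or recording the reservation somewhere
 `_aliases_in_use()` reads.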
 The lock is a no-op on Linux (the `_is_macos()` early-return fires
 before the lock path is opened).
 ## Test impact
 - Unit tests for each extracted helper can mock one subprocess boundary
  at a time (smolvm, docker, pipelock TLS init) without wiring the full
  `launch()` ExitStack.
 - `wait_exec_ready` needs a test with `machine_exec` stubbed to return
  non-zero N times before 0 — verifies the backoff loop and the timeout
  failure path (the shipped implementation raises `SmolvmError` rather
  than calling `die()`).
 - `allocate` tests are unchanged in shape; the lock is acquired and
  released within the call so tests don't need to be aware of it.
 ## Implementation chunks
 1. **PRD (this commit).** Sets the design.
 2. **Decompose `launch()`.**
 3. **Replace sleep with `wait_exec_ready`.**
 4. **File-lock `allocate()`.**
 5. **Tests.** Unit tests for each helper; `wait_exec_ready` backoff + timeout.
 ## References
 - Issue #122: Decompose smolmachines launch and harden bringup sequencing.
 - Issue #117: Complexity hotspots — source of the smolmachines/launch.py finding.
 - Issue #111: Smolmachine sidecar doesn't reliably get refreshed (separate, not addressed here).
@@ -11,6 +11,7 @@ import json
 import sqlite3
 import subprocess
 import tempfile
 import threading
 import unittest
 from pathlib import Path
 from unittest.mock import patch
@@ -144,6 +145,55 @@ class TestAllocate(unittest.TestCase):
                loopback_alias.allocate("demo-overflow")
 class TestAllocateLock(unittest.TestCase):
    """allocate() on macOS takes an exclusive file lock around its
    docker-state read; on other platforms it returns before touching
    the lock.

    These tests check that the lock call is made (or skipped) and that
    back-to-back calls sharing one lock file work. None of them runs
    two allocate() calls concurrently."""
    def test_acquires_exclusive_lock_on_macos(self):
        import fcntl as fcntl_mod
        # Ops passed to the patched flock, in call order.
        flock_calls: list[int] = []
        def record_flock(fd, op):
            flock_calls.append(op)
        # flock is replaced by the recorder, so no real lock is taken;
        # the lock file itself is redirected into a temp dir so the
        # test never touches the user's cache directory.
        with tempfile.TemporaryDirectory() as tmp:
            lock_path = Path(tmp) / "smolmachines.lock"
            with patch.object(loopback_alias, "_is_macos", return_value=True), \
                 patch.object(loopback_alias, "_ALLOC_LOCK_PATH", lock_path), \
                 patch.object(loopback_alias, "_aliases_in_use", return_value=set()), \
                 patch.object(loopback_alias.fcntl, "flock",
                              side_effect=record_flock):
                loopback_alias.allocate("demo")
        self.assertIn(fcntl_mod.LOCK_EX, flock_calls)
    def test_no_lock_on_linux(self):
        # Linux early-returns before touching the lock file.
        with patch.object(loopback_alias, "_is_macos", return_value=False), \
             patch.object(loopback_alias.fcntl, "flock") as flock:
            loopback_alias.allocate("demo")
        flock.assert_not_called()
    def test_sequential_allocations_with_shared_lock_are_serialised(self):
        # Two sequential calls share the same lock file. The second
        # call sees {127.0.0.16} in use (as if the first caller's
        # docker run completed between the two lock acquisitions) and
        # returns the next alias.
        #
        # flock is NOT patched here, so the real lock is taken on an
        # uncontended temp file. Because the calls run one after the
        # other, this checks that the lock file can be reused and that
        # the stubbed in-use set is honoured — it does not demonstrate
        # that concurrent callers are serialised.
        in_use_seq = [set(), {"127.0.0.16"}]
        with tempfile.TemporaryDirectory() as tmp:
            lock_path = Path(tmp) / "smolmachines.lock"
            results: list[str] = []
            for _ in range(2):
                # pop(0) runs while the patch is being built, so each
                # iteration stubs the next in-use set in order.
                with patch.object(loopback_alias, "_is_macos", return_value=True), \
                     patch.object(loopback_alias, "_ALLOC_LOCK_PATH", lock_path), \
                     patch.object(loopback_alias, "_aliases_in_use",
                                  return_value=in_use_seq.pop(0)):
                    results.append(loopback_alias.allocate("demo"))
        self.assertEqual(["127.0.0.16", "127.0.0.17"], results)
 class TestAliasInUseDetection(unittest.TestCase):
    """`_aliases_in_use` inspects every running bundle and pulls
    each container's port-binding `HostIp` out. The detection has
@@ -12,6 +12,7 @@ import unittest
 from pathlib import Path
 from unittest.mock import patch
 from bot_bottle.backend.smolmachines import smolvm as smolvm_mod
 from bot_bottle.backend.smolmachines.smolvm import (
    SmolvmError,
    SmolvmRunResult,
@@ -23,6 +24,7 @@ from bot_bottle.backend.smolmachines.smolvm import (
    machine_start,
    machine_stop,
    pack_create,
    wait_exec_ready,
 )
@@ -204,6 +206,43 @@ class TestErrorPath(unittest.TestCase):
        self.assertEqual(SmolvmRunResult(42, "", "nope"), r)
 class TestWaitExecReady(unittest.TestCase):
    """wait_exec_ready polls machine_exec(name, ["true"]) until it
    returns 0, then exits. On timeout it raises SmolvmError."""
    def test_returns_immediately_when_exec_succeeds_first_try(self):
        # A first-try success must issue exactly one probe.
        with patch.object(smolvm_mod, "machine_exec",
                          return_value=SmolvmRunResult(0, "", "")) as m:
            wait_exec_ready("vm-x")
        m.assert_called_once_with("vm-x", ["true"])
    def test_retries_on_nonzero_and_returns_on_success(self):
        # Two failed probes followed by a success; sleep is patched
        # out so the backoff doesn't slow the test down.
        results = [
            SmolvmRunResult(1, "", "not ready"),
            SmolvmRunResult(1, "", "not ready"),
            SmolvmRunResult(0, "", ""),
        ]
        with patch.object(smolvm_mod, "machine_exec",
                          side_effect=results) as m, \
             patch.object(smolvm_mod.time, "sleep"):
            wait_exec_ready("vm-x")
        self.assertEqual(3, m.call_count)
    def test_raises_smolvm_error_on_timeout(self):
        # machine_exec always returns non-zero. monotonic() is read
        # three times: to set the deadline, for the loop check, and to
        # compute the time remaining after the failed probe. The third
        # read lands past the deadline, so the loop breaks before it
        # ever sleeps. The tick list must match that call count
        # exactly — an extra monotonic() call would exhaust it.
        ticks = [0.0, 0.0, 10.0]  # third call puts us past deadline
        with patch.object(smolvm_mod, "machine_exec",
                          return_value=SmolvmRunResult(1, "", "")), \
             patch.object(smolvm_mod.time, "monotonic",
                          side_effect=ticks), \
             patch.object(smolvm_mod.time, "sleep"):
            with self.assertRaises(SmolvmError) as cm:
                wait_exec_ready("vm-x", timeout=5.0)
        self.assertIn("vm-x", str(cm.exception))
        self.assertIn("not ready", str(cm.exception))
 class TestIsAvailable(unittest.TestCase):
    def test_true_when_on_path(self):
        with patch(