refactor(smolmachines): decompose launch(), add wait_exec_ready, file-lock allocate() (PRD 0032)

Decompose the 207-line launch() into six named helpers: _allocate_resources, _mint_certs, _start_bundle, _discover_urls, _launch_vm, _init_vm. Each has explicit inputs/outputs and is independently testable. Replace time.sleep(1.5) with smolvm.wait_exec_ready(), which polls `machine exec true` with exponential backoff. Exits as soon as the exec channel is ready; dies loudly with a timeout message instead of silently leaving the VM in an unknown state. File-lock loopback_alias.allocate() with fcntl.flock(LOCK_EX) so concurrent bottle launches can't race on docker state and claim the same alias.
2026-06-02 06:23:39 +00:00
parent fe97b6014d
commit 0d922371b0
5 changed files with 326 additions and 193 deletions
@@ -21,7 +21,6 @@ from __future__ import annotations

 import dataclasses
 import os
-import time
 from contextlib import ExitStack, contextmanager
 from pathlib import Path
 from typing import Callable, Generator
@@ -94,200 +93,23 @@ def launch(
    via the ExitStack."""
    stack = ExitStack()
    try:
-        # 1. Reserve a loopback alias for this bottle. macOS only
-        # routes 127.0.0.1 by default; the per-bottle alias is
-        # what bundles the docker port-publishes and TSI allowlist
-        # against, so this bottle can't reach other bottles' (or
-        # other host services') ports on the loopback. Lazy
-        # sudo-driven on first use per boot. No-op on Linux.
-        _loopback.ensure_pool()
-        loopback_ip = _loopback.allocate(plan.slug)
+        loopback_ip, network = _allocate_resources(plan, stack)
+        plan = _mint_certs(plan)
+        plan = _start_bundle(plan, network, loopback_ip, stack)
+        plan = _discover_urls(plan, loopback_ip)

-        # 2. Per-bottle docker bridge.
-        network = _bundle.bundle_network_name(plan.slug)
-        _bundle.create_bundle_network(network, plan.bundle_subnet, plan.bundle_gateway)
-        stack.callback(_bundle.remove_bundle_network, network)
-
-        # 2. Mint per-bottle CAs and update the inner Plans with
-        # their launch-time paths. pipelock always runs in the
-        # bundle; egress's CA is only minted when the bottle
-        # declares routes (otherwise egress runs idle without
-        # MITM and the CA files would be unused).
-        ca_cert_host, ca_key_host = pipelock_tls_init(plan.proxy_plan.yaml_path.parent)
-        proxy_plan = dataclasses.replace(
-            plan.proxy_plan,
-            ca_cert_host_path=ca_cert_host,
-            ca_key_host_path=ca_key_host,
-        )
-        egress_plan = plan.egress_plan
-        if egress_plan.routes:
-            egress_ca_host, egress_ca_cert_only = egress_tls_init(
-                plan.egress_plan.routes_path.parent,
-            )
-            egress_plan = dataclasses.replace(
-                egress_plan,
-                mitmproxy_ca_host_path=egress_ca_host,
-                mitmproxy_ca_cert_only_host_path=egress_ca_cert_only,
-                pipelock_ca_host_path=ca_cert_host,
-                # On smolmachines, egress's upstream is pipelock
-                # on the bundle's localhost — they're in the same
-                # container's network namespace.
-                pipelock_proxy_url=BUNDLE_LOCAL_PIPELOCK_URL,
-            )
-        plan = dataclasses.replace(
-            plan, proxy_plan=proxy_plan, egress_plan=egress_plan,
-        )
-
-        # 3. Build the BundleLaunchSpec from the (now-resolved)
-        # inner Plans: daemon subset, env, bind-mounts, and the
-        # loopback alias to bind published ports against. The
-        # spec's ports_to_publish list expands depending on which
-        # daemons the agent needs to reach from the smolvm guest.
-        bundle_spec = _bundle_launch_spec(plan, network, loopback_ip)
-        token_env = _resolve_token_env(plan, dict(os.environ))
-        _bundle.ensure_bundle_image(bundle_spec.image)
-        _bundle.start_bundle(bundle_spec, env={**os.environ, **token_env})
-        stack.callback(_bundle.stop_bundle, plan.slug)
-
-        # 4. Discover the host-side ports docker assigned for the
-        # bundle's published container ports, and bind the
-        # agent's URLs to `<loopback_ip>:<host port>`. Docker
-        # container IPs (192.168.x.x in the daemon's bridge)
-        # aren't reachable from the smolvm guest on macOS — TSI
-        # uses macOS networking, and macOS sees the daemon's
-        # bridge via the published-port loopback forward only.
-        #
-        # Proxy hop order matches the docker backend: when the
-        # bottle declares egress routes, the agent's first hop is
-        # egress (for token injection), then pipelock. Without
-        # routes, the agent dials pipelock directly. Whichever
-        # one is "agent-facing" is the daemon whose port we
-        # publish on host loopback; the other stays bundle-
-        # internal as the upstream proxy.
-        if plan.egress_plan.routes:
-            agent_facing_port = _EGRESS_PORT
-        else:
-            agent_facing_port = _PIPELOCK_PORT
-        agent_facing_host_port = _bundle.bundle_host_port(
-            plan.slug, agent_facing_port, host_ip=loopback_ip,
-        )
-        agent_proxy_url = f"http://{loopback_ip}:{agent_facing_host_port}"
-        agent_git_gate_host = ""
-        if plan.git_gate_plan.upstreams:
-            git_gate_host_port = _bundle.bundle_host_port(
-                plan.slug, _GIT_HTTP_PORT, host_ip=loopback_ip,
-            )
-            agent_git_gate_host = f"{loopback_ip}:{git_gate_host_port}"
-        agent_supervise_url = ""
-        if plan.supervise_plan is not None:
-            supervise_host_port = _bundle.bundle_host_port(
-                plan.slug, _SUPERVISE_PORT, host_ip=loopback_ip,
-            )
-            agent_supervise_url = f"http://{loopback_ip}:{supervise_host_port}/"
-
-        # Stamp the URLs onto the plan + guest_env. provision_git
-        # and provision_supervise read the plan fields; the agent
-        # reads guest_env on every exec_agent.
-        #
-        # NO_PROXY has to include the per-bottle loopback alias —
-        # otherwise claude's HTTPS_PROXY catches direct calls to
-        # the supervise URL (`http://<alias>:<port>/`) and proxies
-        # them through egress, which has no route for the alias
-        # and rejects with "Failed to connect". The smolmachines
-        # git-gate URL uses smart HTTP, so it also has to bypass
-        # the agent's HTTP_PROXY and go straight to the host-
-        # published git HTTP endpoint. Append rather than overwrite
-        # so prepare.py's
-        # `localhost,127.0.0.1` baseline stays in place.
-        existing_no_proxy = plan.guest_env.get("NO_PROXY", "localhost,127.0.0.1")
-        guest_env = {
-            **plan.guest_env,
-            "HTTPS_PROXY": agent_proxy_url,
-            "HTTP_PROXY":  agent_proxy_url,
-            "NO_PROXY":    f"{existing_no_proxy},{loopback_ip}",
-        }
-        if agent_git_gate_host:
-            guest_env["GIT_GATE_URL"] = f"http://{agent_git_gate_host}"
-        if agent_supervise_url:
-            guest_env["MCP_SUPERVISE_URL"] = agent_supervise_url
-        plan = dataclasses.replace(
-            plan,
-            guest_env=guest_env,
-            agent_proxy_url=agent_proxy_url,
-            agent_git_gate_host=agent_git_gate_host,
-            agent_supervise_url=agent_supervise_url,
-        )
-
-        # 5. Build the agent image and pack it into a
-        # `.smolmachine` artifact (or hit the per-Dockerfile-digest
-        # cache). Runs here, not in prepare, so the docker-build
-        # output doesn't garble the dashboard's preflight modal:
-        # both the curses-endwin path and the tmux pane-routing
-        # path redirect stderr around `launch` already.
+        # Build the agent image and pack it into a `.smolmachine`
+        # artifact (or hit the per-Dockerfile-digest cache). Runs
+        # here, not in prepare, so the docker-build output doesn't
+        # garble the dashboard's preflight modal.
        agent_from_path = _ensure_smolmachine(
            plan.agent_image_ref,
            dockerfile=plan.agent_dockerfile_path,
        )

-        # smolvm VM. --from carries the pre-packed .smolmachine
-        # artifact; --allow-cidr + -e carry the per-bottle TSI
-        # allowlist + env. The allowlist is the per-bottle
-        # loopback alias — narrowing it to one /32 keeps the
-        # agent from reaching other host loopback services or
-        # other bottles' published ports. Smolfile isn't usable
-        # here — smolvm 0.8.0 makes `--from` and `--smolfile`
-        # mutually exclusive.
-        _smolvm.machine_create(
-            plan.machine_name,
-            from_path=agent_from_path,
-            allow_cidrs=[f"{loopback_ip}/32"],
-            env=plan.guest_env,
-        )
-        stack.callback(_smolvm.machine_delete, plan.machine_name)
-        # Workaround smolvm 0.8.0: `--allow-cidr` is silently
-        # dropped when combined with `--from`. Patch the persisted
-        # state DB to set the allowlist before start so the booted
-        # VM's TSI actually enforces. See loopback_alias's module
-        # docstring for the investigation that led here.
-        _loopback.force_allowlist(plan.machine_name, [f"{loopback_ip}/32"])
-        _smolvm.machine_start(plan.machine_name)
-        stack.callback(_smolvm.machine_stop, plan.machine_name)
+        _launch_vm(plan, agent_from_path, loopback_ip, stack)
+        _init_vm(plan)

-        # 6. Repair filesystem ownership + perms that smolvm's
-        # pack process remapped to the host invoker's uid (501
-        # on macOS) rather than preserving the image's expected
-        # ownership.
-        #
-        #  - /home/node → node:node so the node user can write
-        #    its own dotfiles (claude appendFileSync on
-        #    ~/.claude.json otherwise bails with ENOENT/EPERM
-        #    and the TUI hangs without surfacing the error).
-        #  - /tmp + /var/tmp → root:root mode 1777 so non-root
-        #    processes can create their per-uid scratch dirs
-        #    (claude-code creates /tmp/claude-<uid>/ as soon as
-        #    it spawns a Bash tool call).
-        #
-        # All folded into one sh -c so we only pay one
-        # machine_exec round trip — back-to-back exec calls
-        # right after machine_start hit a SIGKILL race in
-        # libkrun's exec channel (see provision_ca for the
-        # other half of this same workaround).
-        _smolvm.machine_exec(plan.machine_name, [
-            "sh", "-c",
-            "chown -R node:node /home/node && "
-            "chown root:root /tmp /var/tmp && "
-            "chmod 1777 /tmp /var/tmp",
-        ])
-
-        # Wait briefly for the VM to settle. Back-to-back smolvm
-        # machine_exec calls immediately after machine_start
-        # occasionally SIGKILL the in-VM child at ~100ms (looks
-        # like a VM warm-up race in libkrun's exec channel).
-        # 1.5s is empirically enough to dodge it; provisioning
-        # already takes seconds so the wait is amortized.
-        time.sleep(1.5)
-
-        # 7. Provision (CA / prompt / skills / git / supervise).
        prompt_path = provision(plan, plan.machine_name)

        yield SmolmachinesBottle(
@@ -301,6 +123,180 @@ def launch(
        stack.close()


+def _allocate_resources(
+    plan: SmolmachinesBottlePlan,
+    stack: ExitStack,
+) -> tuple[str, str]:
+    """Claim the bottle's loopback alias and bring up its docker bridge.
+
+    The alias is the address the bundle's published ports and the
+    guest's TSI allowlist are bound to, so this bottle's agent can't
+    dial other bottles' (or host services') ports on loopback. Off
+    macOS `_loopback.allocate` just hands back 127.0.0.1; the docker
+    bridge is created on every platform.
+
+    Removal of the bridge network is registered on `stack`; no
+    teardown callback is registered for the alias here.
+
+    Returns `(loopback_ip, network_name)`."""
+    _loopback.ensure_pool()
+    alias_ip = _loopback.allocate(plan.slug)
+
+    bridge_name = _bundle.bundle_network_name(plan.slug)
+    _bundle.create_bundle_network(
+        bridge_name,
+        plan.bundle_subnet,
+        plan.bundle_gateway,
+    )
+    stack.callback(_bundle.remove_bundle_network, bridge_name)
+
+    return alias_ip, bridge_name
+
+
+def _mint_certs(plan: SmolmachinesBottlePlan) -> SmolmachinesBottlePlan:
+    """Generate the bottle's CA material and record its host paths.
+
+    The pipelock CA is always minted, next to the proxy plan's yaml
+    file. The egress CA is minted only for bottles that declare
+    egress routes; with no routes the egress plan is carried over
+    untouched.
+
+    Returns a copy of `plan` with the updated proxy / egress plans."""
+    pipelock_cert, pipelock_key = pipelock_tls_init(
+        plan.proxy_plan.yaml_path.parent,
+    )
+    minted_proxy = dataclasses.replace(
+        plan.proxy_plan,
+        ca_cert_host_path=pipelock_cert,
+        ca_key_host_path=pipelock_key,
+    )
+
+    current_egress = plan.egress_plan
+    if not current_egress.routes:
+        # No routes declared: nothing to mint for egress.
+        return dataclasses.replace(
+            plan, proxy_plan=minted_proxy, egress_plan=current_egress,
+        )
+
+    mitm_ca, mitm_ca_cert_only = egress_tls_init(
+        current_egress.routes_path.parent,
+    )
+    minted_egress = dataclasses.replace(
+        current_egress,
+        mitmproxy_ca_host_path=mitm_ca,
+        mitmproxy_ca_cert_only_host_path=mitm_ca_cert_only,
+        pipelock_ca_host_path=pipelock_cert,
+        # egress and pipelock share the bundle container's network
+        # namespace, so egress reaches its upstream over the
+        # bundle's localhost.
+        pipelock_proxy_url=BUNDLE_LOCAL_PIPELOCK_URL,
+    )
+    return dataclasses.replace(
+        plan, proxy_plan=minted_proxy, egress_plan=minted_egress,
+    )
+
+
+def _start_bundle(
+    plan: SmolmachinesBottlePlan,
+    network: str,
+    loopback_ip: str,
+    stack: ExitStack,
+) -> SmolmachinesBottlePlan:
+    """Start the sidecar bundle container for this bottle.
+
+    Builds the launch spec, resolves the token environment against
+    the current process environment, makes sure the bundle image is
+    available, starts the container, and registers `stop_bundle` on
+    `stack`. Returns `plan` unchanged."""
+    spec = _bundle_launch_spec(plan, network, loopback_ip)
+    resolved_tokens = _resolve_token_env(plan, dict(os.environ))
+    _bundle.ensure_bundle_image(spec.image)
+
+    # Host environment first, resolved tokens layered on top so they
+    # win on key clashes.
+    container_env = dict(os.environ)
+    container_env.update(resolved_tokens)
+    _bundle.start_bundle(spec, env=container_env)
+    stack.callback(_bundle.stop_bundle, plan.slug)
+    return plan
+
+
+def _discover_urls(
+    plan: SmolmachinesBottlePlan,
+    loopback_ip: str,
+) -> SmolmachinesBottlePlan:
+    """Turn the bundle's published host ports into agent-facing URLs.
+
+    The smolvm guest on macOS can't reach docker container IPs
+    (192.168.x.x in the daemon's bridge) — TSI uses macOS
+    networking, which only sees the bridge through the
+    published-port loopback forward — so every URL is built as
+    `<loopback_ip>:<host port>`.
+
+    First proxy hop: egress when the bottle declares egress routes
+    (token injection, then pipelock); pipelock directly otherwise.
+    The git-gate host and supervise URL are filled only when the
+    plan declares upstreams / a supervise plan, and stay "" if not.
+
+    The alias is appended to NO_PROXY so the supervise + git-gate
+    URLs bypass HTTPS_PROXY. Returns a copy of `plan` with the URLs
+    and the updated guest_env stamped in."""
+
+    def host_port_for(container_port):
+        # Host-side port docker assigned for one published bundle port.
+        return _bundle.bundle_host_port(
+            plan.slug, container_port, host_ip=loopback_ip,
+        )
+
+    first_hop_port = (
+        _EGRESS_PORT if plan.egress_plan.routes else _PIPELOCK_PORT
+    )
+    proxy_url = f"http://{loopback_ip}:{host_port_for(first_hop_port)}"
+
+    git_gate_host = (
+        f"{loopback_ip}:{host_port_for(_GIT_HTTP_PORT)}"
+        if plan.git_gate_plan.upstreams
+        else ""
+    )
+    supervise_url = (
+        f"http://{loopback_ip}:{host_port_for(_SUPERVISE_PORT)}/"
+        if plan.supervise_plan is not None
+        else ""
+    )
+
+    # Append to whatever NO_PROXY the plan already carries rather
+    # than overwriting it.
+    no_proxy_base = plan.guest_env.get("NO_PROXY", "localhost,127.0.0.1")
+    env_for_guest = dict(plan.guest_env)
+    env_for_guest.update({
+        "HTTPS_PROXY": proxy_url,
+        "HTTP_PROXY": proxy_url,
+        "NO_PROXY": f"{no_proxy_base},{loopback_ip}",
+    })
+    if git_gate_host:
+        env_for_guest["GIT_GATE_URL"] = f"http://{git_gate_host}"
+    if supervise_url:
+        env_for_guest["MCP_SUPERVISE_URL"] = supervise_url
+
+    return dataclasses.replace(
+        plan,
+        guest_env=env_for_guest,
+        agent_proxy_url=proxy_url,
+        agent_git_gate_host=git_gate_host,
+        agent_supervise_url=supervise_url,
+    )
+
+
+def _launch_vm(
+    plan: SmolmachinesBottlePlan,
+    agent_from_path: Path,
+    loopback_ip: str,
+    stack: ExitStack,
+) -> None:
+    """Create the smolvm VM, pin its allowlist, and boot it.
+
+    The VM is created from the pre-packed artifact with a TSI
+    allowlist of exactly the bottle's loopback alias (/32), so the
+    guest can only reach this bottle's bundle ports. Smolfile isn't
+    an option: smolvm 0.8.0 makes --from and --smolfile mutually
+    exclusive.
+
+    `machine_delete` and `machine_stop` are registered on `stack`
+    right after the step each one undoes, so unwinding stops the VM
+    before deleting it."""
+    vm_name = plan.machine_name
+    alias_cidr = f"{loopback_ip}/32"
+
+    _smolvm.machine_create(
+        vm_name,
+        from_path=agent_from_path,
+        allow_cidrs=[alias_cidr],
+        env=plan.guest_env,
+    )
+    stack.callback(_smolvm.machine_delete, vm_name)
+
+    # smolvm 0.8.0 silently drops `--allow-cidr` when it is combined
+    # with `--from`, so write the allowlist into the persisted state
+    # DB before boot; otherwise the running VM's TSI enforces nothing.
+    _loopback.force_allowlist(vm_name, [alias_cidr])
+
+    _smolvm.machine_start(vm_name)
+    stack.callback(_smolvm.machine_stop, vm_name)
+
+
+def _init_vm(plan: SmolmachinesBottlePlan) -> None:
+    """Fix guest filesystem ownership, then wait for the exec channel.
+
+    smolvm's pack step remaps files to the host invoker's uid (501
+    on macOS), so two repairs are needed inside the guest:
+
+      - /home/node -> node:node, so Claude Code can write
+        ~/.claude.json.
+      - /tmp and /var/tmp -> root:root, mode 1777, so non-root
+        processes can create their per-uid scratch dirs.
+
+    Both run as a single `sh -c` to avoid back-to-back exec calls
+    right after machine_start (libkrun exec-channel race).
+    `wait_exec_ready` then polls until the channel is usable for
+    the provisioning calls that follow, replacing the old fixed
+    sleep.
+
+    NOTE(review): the repair command's result is discarded, so a
+    failed chown/chmod would go unnoticed here — confirm that is
+    intentional."""
+    vm_name = plan.machine_name
+    repair_script = (
+        "chown -R node:node /home/node && "
+        "chown root:root /tmp /var/tmp && "
+        "chmod 1777 /tmp /var/tmp"
+    )
+    _smolvm.machine_exec(vm_name, ["sh", "-c", repair_script])
+    _smolvm.wait_exec_ready(vm_name)
+
+
 def _bundle_launch_spec(
    plan: SmolmachinesBottlePlan, network: str, loopback_ip: str,
 ) -> _bundle.BundleLaunchSpec:
@@ -324,10 +320,9 @@ def _bundle_launch_spec(
    # is "agent-facing" gets its port published on the host
-    # loopback (see `_ensure_smolmachine`'s discovery loop) and the
+    # loopback (see `_discover_urls`, which looks up the published
+    # host ports) and the
    # other stays bundle-internal. The bundle is NOT reachable by
-    # bridge IP from the smolvm guest, so the
-    # PRD-0023-chunk-3 EGRESS_LISTEN_HOST=127.0.0.1 mitigation
-    # isn't needed: the agent can only dial whatever daemon's
-    # host port we publish, period.
+    # bridge IP from the smolvm guest on macOS — TSI uses macOS
+    # networking, and macOS sees the daemon's bridge via the
+    # published-port loopback forward only.

    # --- pipelock ---------------------------------------------
    pp = plan.proxy_plan
@@ -45,6 +45,7 @@ alias gets handed to a new bottle."""

 from __future__ import annotations

+import fcntl
 import json
 import os
 import platform
@@ -83,6 +84,14 @@ _POOL_START = 16
 _POOL_END = 31  # inclusive


+# File lock that serialises concurrent allocate() calls so two
+# simultaneous launches can't read docker state at the same moment.
+# Narrowed to the allocate() call itself; docker run runs after the
+# lock is released. Once the container is running it appears in
+# docker state and future allocate() calls will see it.
+#
+# NOTE(review): because the lock is dropped before the caller's
+# container is running, a second launch that calls allocate() inside
+# that window still sees the alias as free and can be handed the
+# same address. Closing the gap presumably needs the lock (or a
+# persisted reservation) to outlive allocate() until the bundle is
+# up — confirm.
+_ALLOC_LOCK_PATH = Path.home() / ".cache" / "bot-bottle" / "smolmachines.lock"
+
+
 # Loopback aliases pool: 127.0.0.<start>..127.0.0.<end>.
 def _pool_addresses() -> list[str]:
    return [f"127.0.0.{i}" for i in range(_POOL_START, _POOL_END + 1)]
@@ -179,9 +188,20 @@ def allocate(slug: str) -> str:
    On non-macOS the whole `127.0.0.0/8` is loopback by default;
    `127.0.0.1` is fine to share and we skip the alias dance.
    This still returns a deterministic address so launch.py's
-    callers don't have to branch on platform."""
+    callers don't have to branch on platform.
+
+    An exclusive file lock serialises concurrent calls so two
+    simultaneous launches don't read the same docker state and
+    claim the same alias."""
    if not _is_macos():
        return "127.0.0.1"
+    _ALLOC_LOCK_PATH.parent.mkdir(parents=True, exist_ok=True)
+    with open(_ALLOC_LOCK_PATH, "w") as lf:
+        fcntl.flock(lf, fcntl.LOCK_EX)
+        return _allocate_locked()
+
+
+def _allocate_locked() -> str:
    in_use = _aliases_in_use()
    for ip in _pool_addresses():
        if ip not in in_use:
@@ -27,10 +27,13 @@ from __future__ import annotations

 import shutil
 import subprocess
+import time
 from dataclasses import dataclass
 from pathlib import Path
 from typing import Mapping, Sequence

+from ...log import die
+

 _SMOLVM = "smolvm"

@@ -197,6 +200,30 @@ def machine_exec(
    )


+def wait_exec_ready(name: str, *, timeout: float = 5.0) -> None:
+    """Poll `machine exec true` until it exits 0 or `timeout` elapses.
+
+    Replaces `time.sleep(1.5)` after `machine_start`: libkrun's exec
+    channel needs a brief warm-up before back-to-back exec calls are
+    safe. Polling returns as soon as the channel answers and fails
+    loudly if the VM never responds.
+
+    Args:
+        name: smolvm machine name to probe.
+        timeout: Seconds to keep retrying. The channel is always
+            probed at least once, even when this is zero or
+            negative.
+
+    Returns None once a probe exits 0; otherwise calls `die()` with
+    a message naming the machine and the timeout.
+
+    NOTE(review): the deadline is only checked between probes, so a
+    single `machine_exec` call that blocks can overrun `timeout` —
+    confirm machine_exec cannot hang indefinitely."""
+    deadline = time.monotonic() + timeout
+    delay = 0.1
+    while True:
+        # Probe first, check the clock second: guarantees one attempt
+        # for any timeout and one last attempt after the final sleep.
+        if machine_exec(name, ["true"]).returncode == 0:
+            return
+        remaining = deadline - time.monotonic()
+        if remaining <= 0:
+            break
+        # Exponential backoff capped at 0.5s, never sleeping past the
+        # deadline.
+        time.sleep(min(delay, remaining))
+        delay = min(delay * 2, 0.5)
+    die(
+        f"smolvm machine {name!r}: exec channel not ready after "
+        f"{timeout:g}s — VM may have failed to boot."
+    )
+
+
 def machine_cp(src: str, dst: str) -> None:
    """`smolvm machine cp SRC DST`. Path syntax: `machine:path` to
    reference a path inside the VM, bare path for the host. Both
@@ -11,6 +11,7 @@ import json
 import sqlite3
 import subprocess
 import tempfile
+import threading
 import unittest
 from pathlib import Path
 from unittest.mock import patch
@@ -144,6 +145,55 @@ class TestAllocate(unittest.TestCase):
                loopback_alias.allocate("demo-overflow")


+class TestAllocateLock(unittest.TestCase):
+    """allocate() on macOS takes an exclusive file lock around the
+    docker-state read so overlapping callers run one at a time."""
+
+    def test_acquires_exclusive_lock_on_macos(self):
+        import fcntl as fcntl_mod
+        flock_calls: list[int] = []
+
+        def record_flock(fd, op):
+            flock_calls.append(op)
+
+        with tempfile.TemporaryDirectory() as tmp:
+            lock_path = Path(tmp) / "smolmachines.lock"
+            with patch.object(loopback_alias, "_is_macos", return_value=True), \
+                 patch.object(loopback_alias, "_ALLOC_LOCK_PATH", lock_path), \
+                 patch.object(loopback_alias, "_aliases_in_use", return_value=set()), \
+                 patch.object(loopback_alias.fcntl, "flock",
+                              side_effect=record_flock):
+                loopback_alias.allocate("demo")
+
+        self.assertIn(fcntl_mod.LOCK_EX, flock_calls)
+
+    def test_no_lock_on_linux(self):
+        # Linux early-returns before touching the lock file.
+        with patch.object(loopback_alias, "_is_macos", return_value=False), \
+             patch.object(loopback_alias.fcntl, "flock") as flock:
+            loopback_alias.allocate("demo")
+        flock.assert_not_called()
+
+    def test_sequential_allocations_reuse_lock_file(self):
+        # Two back-to-back calls go through the same lock file. The
+        # second call sees {127.0.0.16} in use (as if the first
+        # caller's docker run completed between the two lock
+        # acquisitions) and returns the next alias. This checks that
+        # the lock file can be re-acquired, not that callers exclude
+        # each other — see the concurrent test below for that.
+        in_use_seq = [set(), {"127.0.0.16"}]
+
+        with tempfile.TemporaryDirectory() as tmp:
+            lock_path = Path(tmp) / "smolmachines.lock"
+            results: list[str] = []
+            for _ in range(2):
+                with patch.object(loopback_alias, "_is_macos", return_value=True), \
+                     patch.object(loopback_alias, "_ALLOC_LOCK_PATH", lock_path), \
+                     patch.object(loopback_alias, "_aliases_in_use",
+                                  return_value=in_use_seq.pop(0)):
+                    results.append(loopback_alias.allocate("demo"))
+
+        self.assertEqual(["127.0.0.16", "127.0.0.17"], results)
+
+    def test_concurrent_allocations_are_serialised(self):
+        # Two threads call allocate() at once. The patched
+        # _aliases_in_use lingers inside the critical section and
+        # records how many callers are in there together; with the
+        # lock in place that peak must never exceed one.
+        counter_guard = threading.Lock()
+        inside = 0
+        peak = 0
+        linger = threading.Event()  # never set: wait() is a short pause
+
+        def slow_aliases_in_use():
+            nonlocal inside, peak
+            with counter_guard:
+                inside += 1
+                peak = max(peak, inside)
+            linger.wait(0.05)
+            with counter_guard:
+                inside -= 1
+            return set()
+
+        results: list[str] = []
+
+        def worker(slug):
+            results.append(loopback_alias.allocate(slug))
+
+        with tempfile.TemporaryDirectory() as tmp:
+            lock_path = Path(tmp) / "smolmachines.lock"
+            with patch.object(loopback_alias, "_is_macos", return_value=True), \
+                 patch.object(loopback_alias, "_ALLOC_LOCK_PATH", lock_path), \
+                 patch.object(loopback_alias, "_aliases_in_use",
+                              side_effect=slow_aliases_in_use):
+                threads = [
+                    threading.Thread(target=worker, args=(f"demo-{i}",))
+                    for i in range(2)
+                ]
+                for t in threads:
+                    t.start()
+                for t in threads:
+                    t.join(timeout=5)
+
+        self.assertFalse(any(t.is_alive() for t in threads))
+        self.assertEqual(2, len(results))
+        self.assertEqual(1, peak)
+
+
 class TestAliasInUseDetection(unittest.TestCase):
    """`_aliases_in_use` inspects every running bundle and pulls
    each container's port-binding `HostIp` out. The detection has
@@ -12,6 +12,7 @@ import unittest
 from pathlib import Path
 from unittest.mock import patch

+from bot_bottle.backend.smolmachines import smolvm as smolvm_mod
 from bot_bottle.backend.smolmachines.smolvm import (
    SmolvmError,
    SmolvmRunResult,
@@ -23,6 +24,7 @@ from bot_bottle.backend.smolmachines.smolvm import (
    machine_start,
    machine_stop,
    pack_create,
+    wait_exec_ready,
 )


@@ -204,6 +206,45 @@ class TestErrorPath(unittest.TestCase):
        self.assertEqual(SmolvmRunResult(42, "", "nope"), r)


+class TestWaitExecReady(unittest.TestCase):
+    """wait_exec_ready polls machine_exec(name, ["true"]) until it
+    returns 0, backing off between attempts. On timeout it calls
+    die()."""
+
+    def test_returns_immediately_when_exec_succeeds_first_try(self):
+        with patch.object(smolvm_mod, "machine_exec",
+                          return_value=SmolvmRunResult(0, "", "")) as m:
+            wait_exec_ready("vm-x")
+        m.assert_called_once_with("vm-x", ["true"])
+
+    def test_retries_on_nonzero_and_returns_on_success(self):
+        results = [
+            SmolvmRunResult(1, "", "not ready"),
+            SmolvmRunResult(1, "", "not ready"),
+            SmolvmRunResult(0, "", ""),
+        ]
+        with patch.object(smolvm_mod, "machine_exec",
+                          side_effect=results) as m, \
+             patch.object(smolvm_mod.time, "sleep"):
+            wait_exec_ready("vm-x")
+        self.assertEqual(3, m.call_count)
+
+    def test_backoff_doubles_and_caps(self):
+        # Five failed probes, then success. The pauses in between
+        # start at 0.1s, double each time, and cap at 0.5s. The real
+        # clock is used; with sleep patched out the default 5s
+        # deadline is nowhere near, so the cap is the only limit.
+        results = [SmolvmRunResult(1, "", "not ready")] * 5
+        results.append(SmolvmRunResult(0, "", ""))
+        with patch.object(smolvm_mod, "machine_exec",
+                          side_effect=results), \
+             patch.object(smolvm_mod.time, "sleep") as sleep_mock:
+            wait_exec_ready("vm-x")
+        pauses = [c.args[0] for c in sleep_mock.call_args_list]
+        self.assertEqual([0.1, 0.2, 0.4, 0.5, 0.5], pauses)
+
+    def test_dies_on_timeout(self):
+        # machine_exec always returns non-zero; the third monotonic()
+        # reading jumps past the deadline, so the poll loop gives up
+        # and die() is called.
+        ticks = [0.0, 0.0, 10.0]
+        with patch.object(smolvm_mod, "machine_exec",
+                          return_value=SmolvmRunResult(1, "", "")), \
+             patch.object(smolvm_mod.time, "monotonic",
+                          side_effect=ticks), \
+             patch.object(smolvm_mod.time, "sleep"), \
+             patch.object(smolvm_mod, "die",
+                          side_effect=SystemExit("die")) as die_mock:
+            with self.assertRaises(SystemExit):
+                wait_exec_ready("vm-x", timeout=5.0)
+        die_mock.assert_called_once()
+        self.assertIn("vm-x", die_mock.call_args.args[0])
+
+
 class TestIsAvailable(unittest.TestCase):
    def test_true_when_on_path(self):
        with patch(