refactor(smolmachines): decompose launch(), add wait_exec_ready, file-lock allocate() (PRD 0032)

Decompose the 207-line launch() into six named helpers: _allocate_resources, _mint_certs, _start_bundle, _discover_urls, _launch_vm, _init_vm. Each has explicit inputs and outputs and is independently testable. Replace time.sleep(1.5) with smolvm.wait_exec_ready(), which polls `machine exec true` with exponential backoff (capped at 0.5s between probes). It returns as soon as the exec channel is ready, and dies loudly with a timeout message instead of silently leaving the VM in an unknown state. Guard loopback_alias.allocate() with an exclusive file lock (fcntl.flock(LOCK_EX)) so concurrent bottle launches can't race on docker state and claim the same alias.
2026-06-02 06:23:39 +00:00
parent fe97b6014d
commit 0d922371b0
5 changed files with 326 additions and 193 deletions
@@ -21,7 +21,6 @@ from __future__ import annotations
 import dataclasses
 import os
 import time
 from contextlib import ExitStack, contextmanager
 from pathlib import Path
 from typing import Callable, Generator
@@ -94,25 +93,60 @@ def launch(
    via the ExitStack."""
    stack = ExitStack()
    try:
-        # 1. Reserve a loopback alias for this bottle. macOS only
+        loopback_ip, network = _allocate_resources(plan, stack)
-        # routes 127.0.0.1 by default; the per-bottle alias is
+        plan = _mint_certs(plan)
-        # what bundles the docker port-publishes and TSI allowlist
+        plan = _start_bundle(plan, network, loopback_ip, stack)
-        # against, so this bottle can't reach other bottles' (or
+        plan = _discover_urls(plan, loopback_ip)
-        # other host services') ports on the loopback. Lazy
+
-        # sudo-driven on first use per boot. No-op on Linux.
+        # Build the agent image and pack it into a `.smolmachine`
        # artifact (or hit the per-Dockerfile-digest cache). Runs
        # here, not in prepare, so the docker-build output doesn't
        # garble the dashboard's preflight modal.
        agent_from_path = _ensure_smolmachine(
            plan.agent_image_ref,
            dockerfile=plan.agent_dockerfile_path,
        )
        _launch_vm(plan, agent_from_path, loopback_ip, stack)
        _init_vm(plan)
        prompt_path = provision(plan, plan.machine_name)
        yield SmolmachinesBottle(
            plan.machine_name,
            prompt_path=prompt_path,
            guest_env=plan.guest_env,
            agent_command=plan.agent_command,
            agent_prompt_mode=plan.agent_prompt_mode,
        )
    finally:
        stack.close()
 def _allocate_resources(
    plan: SmolmachinesBottlePlan,
    stack: ExitStack,
 ) -> tuple[str, str]:
    """Reserve this bottle's loopback alias and create its docker bridge.

    macOS only routes 127.0.0.1 by default; the per-bottle alias
    scopes TSI's allowlist to this bottle's published ports so the
    agent can't reach other bottles' or host services' ports on
    loopback. On non-macOS hosts `allocate()` skips the alias dance
    and hands back 127.0.0.1.

    Returns `(loopback_ip, network)`. Removal of the bridge network
    is registered on `stack`, so closing the stack tears it down.
    """
    _loopback.ensure_pool()
    alias_ip = _loopback.allocate(plan.slug)

    bridge_name = _bundle.bundle_network_name(plan.slug)
    _bundle.create_bundle_network(
        bridge_name,
        plan.bundle_subnet,
        plan.bundle_gateway,
    )
    # Teardown is registered only once the network has been created.
    stack.callback(_bundle.remove_bundle_network, bridge_name)

    return alias_ip, bridge_name
-        # 2. Mint per-bottle CAs and update the inner Plans with
+
-        # their launch-time paths. pipelock always runs in the
+def _mint_certs(plan: SmolmachinesBottlePlan) -> SmolmachinesBottlePlan:
-        # bundle; egress's CA is only minted when the bottle
+    """Mint per-bottle CAs and return the plan with CA paths filled.
-        # declares routes (otherwise egress runs idle without
+
-        # MITM and the CA files would be unused).
+    Pipelock always runs in the bundle. Egress's CA is only minted
    when the bottle declares routes — otherwise egress runs idle
    without MITM and the CA files would be unused."""
    ca_cert_host, ca_key_host = pipelock_tls_init(plan.proxy_plan.yaml_path.parent)
    proxy_plan = dataclasses.replace(
        plan.proxy_plan,
@@ -129,41 +163,47 @@ def launch(
            mitmproxy_ca_host_path=egress_ca_host,
            mitmproxy_ca_cert_only_host_path=egress_ca_cert_only,
            pipelock_ca_host_path=ca_cert_host,
-                # On smolmachines, egress's upstream is pipelock
+            # On smolmachines, egress's upstream is pipelock on the
-                # on the bundle's localhost — they're in the same
+            # bundle's localhost — they're in the same container's
-                # container's network namespace.
+            # network namespace.
            pipelock_proxy_url=BUNDLE_LOCAL_PIPELOCK_URL,
        )
-        plan = dataclasses.replace(
+    return dataclasses.replace(plan, proxy_plan=proxy_plan, egress_plan=egress_plan)
            plan, proxy_plan=proxy_plan, egress_plan=egress_plan,
        )
-        # 3. Build the BundleLaunchSpec from the (now-resolved)
+
-        # inner Plans: daemon subset, env, bind-mounts, and the
+def _start_bundle(
-        # loopback alias to bind published ports against. The
+    plan: SmolmachinesBottlePlan,
-        # spec's ports_to_publish list expands depending on which
+    network: str,
-        # daemons the agent needs to reach from the smolvm guest.
+    loopback_ip: str,
    stack: ExitStack,
 ) -> SmolmachinesBottlePlan:
    """Build the BundleLaunchSpec, resolve token env, start the
    sidecar bundle container, and register teardown."""
    bundle_spec = _bundle_launch_spec(plan, network, loopback_ip)
    token_env = _resolve_token_env(plan, dict(os.environ))
    _bundle.ensure_bundle_image(bundle_spec.image)
    _bundle.start_bundle(bundle_spec, env={**os.environ, **token_env})
    stack.callback(_bundle.stop_bundle, plan.slug)
    return plan
-        # 4. Discover the host-side ports docker assigned for the
+
-        # bundle's published container ports, and bind the
+def _discover_urls(
-        # agent's URLs to `<loopback_ip>:<host port>`. Docker
+    plan: SmolmachinesBottlePlan,
-        # container IPs (192.168.x.x in the daemon's bridge)
+    loopback_ip: str,
-        # aren't reachable from the smolvm guest on macOS — TSI
+) -> SmolmachinesBottlePlan:
-        # uses macOS networking, and macOS sees the daemon's
+    """Discover host-side ports for published container ports and
-        # bridge via the published-port loopback forward only.
+    return the plan with URLs + guest_env stamped in.
-        #
+
-        # Proxy hop order matches the docker backend: when the
+    Docker container IPs (192.168.x.x in the daemon's bridge)
-        # bottle declares egress routes, the agent's first hop is
+    aren't reachable from the smolvm guest on macOS — TSI uses
-        # egress (for token injection), then pipelock. Without
+    macOS networking, and macOS sees the daemon's bridge via the
-        # routes, the agent dials pipelock directly. Whichever
+    published-port loopback forward only.
-        # one is "agent-facing" is the daemon whose port we
+
-        # publish on host loopback; the other stays bundle-
+    Proxy hop order: when the bottle declares egress routes, the
-        # internal as the upstream proxy.
+    agent's first hop is egress (for token injection), then
    pipelock. Without routes, the agent dials pipelock directly.
    NO_PROXY includes the per-bottle loopback alias so the
    supervise + git-gate URLs bypass HTTPS_PROXY."""
    if plan.egress_plan.routes:
        agent_facing_port = _EGRESS_PORT
    else:
@@ -172,12 +212,14 @@ def launch(
        plan.slug, agent_facing_port, host_ip=loopback_ip,
    )
    agent_proxy_url = f"http://{loopback_ip}:{agent_facing_host_port}"
    agent_git_gate_host = ""
    if plan.git_gate_plan.upstreams:
        git_gate_host_port = _bundle.bundle_host_port(
            plan.slug, _GIT_HTTP_PORT, host_ip=loopback_ip,
        )
        agent_git_gate_host = f"{loopback_ip}:{git_gate_host_port}"
    agent_supervise_url = ""
    if plan.supervise_plan is not None:
        supervise_host_port = _bundle.bundle_host_port(
@@ -185,20 +227,6 @@ def launch(
        )
        agent_supervise_url = f"http://{loopback_ip}:{supervise_host_port}/"
        # Stamp the URLs onto the plan + guest_env. provision_git
        # and provision_supervise read the plan fields; the agent
        # reads guest_env on every exec_agent.
        #
        # NO_PROXY has to include the per-bottle loopback alias —
        # otherwise claude's HTTPS_PROXY catches direct calls to
        # the supervise URL (`http://<alias>:<port>/`) and proxies
        # them through egress, which has no route for the alias
        # and rejects with "Failed to connect". The smolmachines
        # git-gate URL uses smart HTTP, so it also has to bypass
        # the agent's HTTP_PROXY and go straight to the host-
        # published git HTTP endpoint. Append rather than overwrite
        # so prepare.py's
        # `localhost,127.0.0.1` baseline stays in place.
    existing_no_proxy = plan.guest_env.get("NO_PROXY", "localhost,127.0.0.1")
    guest_env = {
        **plan.guest_env,
@@ -210,7 +238,8 @@ def launch(
        guest_env["GIT_GATE_URL"] = f"http://{agent_git_gate_host}"
    if agent_supervise_url:
        guest_env["MCP_SUPERVISE_URL"] = agent_supervise_url
-        plan = dataclasses.replace(
+
    return dataclasses.replace(
        plan,
        guest_env=guest_env,
        agent_proxy_url=agent_proxy_url,
@@ -218,25 +247,20 @@ def launch(
        agent_supervise_url=agent_supervise_url,
    )
        # 5. Build the agent image and pack it into a
        # `.smolmachine` artifact (or hit the per-Dockerfile-digest
        # cache). Runs here, not in prepare, so the docker-build
        # output doesn't garble the dashboard's preflight modal:
        # both the curses-endwin path and the tmux pane-routing
        # path redirect stderr around `launch` already.
        agent_from_path = _ensure_smolmachine(
            plan.agent_image_ref,
            dockerfile=plan.agent_dockerfile_path,
        )
-        # smolvm VM. --from carries the pre-packed .smolmachine
+def _launch_vm(
-        # artifact; --allow-cidr + -e carry the per-bottle TSI
+    plan: SmolmachinesBottlePlan,
-        # allowlist + env. The allowlist is the per-bottle
+    agent_from_path: Path,
-        # loopback alias — narrowing it to one /32 keeps the
+    loopback_ip: str,
-        # agent from reaching other host loopback services or
+    stack: ExitStack,
-        # other bottles' published ports. Smolfile isn't usable
+) -> None:
-        # here — smolvm 0.8.0 makes `--from` and `--smolfile`
+    """Create, patch, and start the smolvm VM; register teardown.
-        # mutually exclusive.
+
    --allow-cidr is the per-bottle loopback alias so the guest can
    only reach this bottle's bundle ports. force_allowlist patches
    smolvm 0.8.0's silent-drop of --allow-cidr when combined with
    --from. Smolfile isn't usable here — smolvm 0.8.0 makes --from
    and --smolfile mutually exclusive."""
    _smolvm.machine_create(
        plan.machine_name,
        from_path=agent_from_path,
@@ -244,61 +268,33 @@ def launch(
        env=plan.guest_env,
    )
    stack.callback(_smolvm.machine_delete, plan.machine_name)
-        # Workaround smolvm 0.8.0: `--allow-cidr` is silently
+    # Workaround smolvm 0.8.0: `--allow-cidr` is silently dropped
-        # dropped when combined with `--from`. Patch the persisted
+    # when combined with `--from`. Patch the persisted state DB
-        # state DB to set the allowlist before start so the booted
+    # before start so the booted VM's TSI actually enforces.
        # VM's TSI actually enforces. See loopback_alias's module
        # docstring for the investigation that led here.
    _loopback.force_allowlist(plan.machine_name, [f"{loopback_ip}/32"])
    _smolvm.machine_start(plan.machine_name)
    stack.callback(_smolvm.machine_stop, plan.machine_name)
-        # 6. Repair filesystem ownership + perms that smolvm's
+
-        # pack process remapped to the host invoker's uid (501
+def _init_vm(plan: SmolmachinesBottlePlan) -> None:
-        # on macOS) rather than preserving the image's expected
+    """Repair filesystem ownership and wait for exec channel readiness.
-        # ownership.
+
-        #
+    Ownership repair: smolvm's pack process remaps files to the host
-        #  - /home/node → node:node so the node user can write
+    invoker's uid (501 on macOS). /home/node must be node:node so
-        #    its own dotfiles (claude appendFileSync on
+    Claude Code can write ~/.claude.json; /tmp + /var/tmp need root
-        #    ~/.claude.json otherwise bails with ENOENT/EPERM
+    mode 1777 so non-root processes can create per-uid scratch dirs.
-        #    and the TUI hangs without surfacing the error).
+    All folded into one sh -c to avoid back-to-back exec calls
-        #  - /tmp + /var/tmp → root:root mode 1777 so non-root
+    immediately after machine_start (libkrun exec-channel race).
-        #    processes can create their per-uid scratch dirs
+
-        #    (claude-code creates /tmp/claude-<uid>/ as soon as
+    wait_exec_ready polls until the exec channel is ready for the
-        #    it spawns a Bash tool call).
+    subsequent provision calls, replacing the empirical sleep."""
        #
        # All folded into one sh -c so we only pay one
        # machine_exec round trip — back-to-back exec calls
        # right after machine_start hit a SIGKILL race in
        # libkrun's exec channel (see provision_ca for the
        # other half of this same workaround).
    _smolvm.machine_exec(plan.machine_name, [
        "sh", "-c",
        "chown -R node:node /home/node && "
        "chown root:root /tmp /var/tmp && "
        "chmod 1777 /tmp /var/tmp",
    ])
-
+    _smolvm.wait_exec_ready(plan.machine_name)
        # Wait briefly for the VM to settle. Back-to-back smolvm
        # machine_exec calls immediately after machine_start
        # occasionally SIGKILL the in-VM child at ~100ms (looks
        # like a VM warm-up race in libkrun's exec channel).
        # 1.5s is empirically enough to dodge it; provisioning
        # already takes seconds so the wait is amortized.
        time.sleep(1.5)
        # 7. Provision (CA / prompt / skills / git / supervise).
        prompt_path = provision(plan, plan.machine_name)
        yield SmolmachinesBottle(
            plan.machine_name,
            prompt_path=prompt_path,
            guest_env=plan.guest_env,
            agent_command=plan.agent_command,
            agent_prompt_mode=plan.agent_prompt_mode,
        )
    finally:
        stack.close()
 def _bundle_launch_spec(
@@ -324,10 +320,9 @@ def _bundle_launch_spec(
    # is "agent-facing" gets its port published on the host
    # loopback (see `_ensure_smolmachine`'s discovery loop) and the
    # other stays bundle-internal. The bundle is NOT reachable by
-    # bridge IP from the smolvm guest, so the
+    # bridge IP from the smolvm guest on macOS — TSI uses macOS
-    # PRD-0023-chunk-3 EGRESS_LISTEN_HOST=127.0.0.1 mitigation
+    # networking, and macOS sees the daemon's bridge via the
-    # isn't needed: the agent can only dial whatever daemon's
+    # published-port loopback forward only.
    # host port we publish, period.
    # --- pipelock ---------------------------------------------
    pp = plan.proxy_plan
@@ -45,6 +45,7 @@ alias gets handed to a new bottle."""
 from __future__ import annotations
 import fcntl
 import json
 import os
 import platform
@@ -83,6 +84,14 @@ _POOL_START = 16
 _POOL_END = 31  # inclusive
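 # File lock that serialises concurrent allocate() calls so two
 # simultaneous launches can't read the same docker state and claim
 # the same alias. The lock is held only for the allocate() call
 # itself; `docker run` happens after the lock is released, and the
 # alias only becomes visible to later allocate() calls once that
 # container is running and shows up in docker state.
 # NOTE(review): that leaves a window between lock release and
 # container start in which a second launch could still be handed
 # the same alias — confirm whether this window needs closing.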
 _ALLOC_LOCK_PATH = Path.home() / ".cache" / "bot-bottle" / "smolmachines.lock"
 # Loopback aliases pool: 127.0.0.<start>..127.0.0.<end>.
 def _pool_addresses() -> list[str]:
    """Return every alias in the pool, lowest address first:
    127.0.0.<_POOL_START> through 127.0.0.<_POOL_END>, inclusive."""
    # `_POOL_END` is inclusive, hence the +1 on the range bound.
    last_octets = range(_POOL_START, _POOL_END + 1)
    return [f"127.0.0.{octet}" for octet in last_octets]
@@ -179,9 +188,20 @@ def allocate(slug: str) -> str:
    On non-macOS the whole `127.0.0.0/8` is loopback by default;
    `127.0.0.1` is fine to share and we skip the alias dance.
    This still returns a deterministic address so launch.py's
-    callers don't have to branch on platform."""
+    callers don't have to branch on platform.
    An exclusive file lock serialises concurrent calls so two
    simultaneous launches don't read the same docker state and
    claim the same alias."""
    if not _is_macos():
        return "127.0.0.1"
    _ALLOC_LOCK_PATH.parent.mkdir(parents=True, exist_ok=True)
    with open(_ALLOC_LOCK_PATH, "w") as lf:
        fcntl.flock(lf, fcntl.LOCK_EX)
        return _allocate_locked()
 def _allocate_locked() -> str:
    in_use = _aliases_in_use()
    for ip in _pool_addresses():
        if ip not in in_use:
@@ -27,10 +27,13 @@ from __future__ import annotations
 import shutil
 import subprocess
 import time
 from dataclasses import dataclass
 from pathlib import Path
 from typing import Mapping, Sequence
 from ...log import die
 _SMOLVM = "smolvm"
@@ -197,6 +200,30 @@ def machine_exec(
    )
 def wait_exec_ready(name: str, *, timeout: float = 5.0) -> None:
    """Poll `machine exec true` until it exits 0 or `timeout` elapses.

    Replaces `time.sleep(1.5)` after `machine_start`: libkrun's exec
    channel needs a brief warm-up before back-to-back exec calls are
    safe. Polling returns as soon as the channel is ready and fails
    loudly if the VM never responds.

    Args:
        name: machine name handed to `machine_exec`.
        timeout: overall budget in seconds, measured on the monotonic
            clock. The first probe always runs, even when the budget
            is zero or negative.

    Returns:
        None, as soon as a probe exits 0. If no probe succeeds within
        the budget, `die()` is called with a message naming the
        machine.
    """
    deadline = time.monotonic() + timeout
    delay = 0.1
    while True:
        # Probe first, check the clock second. This guarantees at
        # least one attempt and makes the sleep that runs up to the
        # deadline be followed by a final probe instead of an
        # immediate give-up.
        if machine_exec(name, ["true"]).returncode == 0:
            return
        remaining = deadline - time.monotonic()
        if remaining <= 0:
            break
        # Exponential backoff, 0.1s doubling up to a 0.5s cap, and
        # never sleeping past the deadline.
        time.sleep(min(delay, remaining))
        delay = min(delay * 2, 0.5)
    # `:g` keeps "5" for the default but reports fractional budgets
    # truthfully (`.0f` would print 0.4 as "0s").
    die(
        f"smolvm machine {name!r}: exec channel not ready after "
        f"{timeout:g}s — VM may have failed to boot."
    )
 def machine_cp(src: str, dst: str) -> None:
    """`smolvm machine cp SRC DST`. Path syntax: `machine:path` to
    reference a path inside the VM, bare path for the host. Both
@@ -11,6 +11,7 @@ import json
 import sqlite3
 import subprocess
 import tempfile
 import threading
 import unittest
 from pathlib import Path
 from unittest.mock import patch
@@ -144,6 +145,55 @@ class TestAllocate(unittest.TestCase):
                loopback_alias.allocate("demo-overflow")
 class TestAllocateLock(unittest.TestCase):
    """On macOS, allocate() takes a file lock so that concurrent
    callers are serialised instead of racing on docker state."""

    def test_acquires_exclusive_lock_on_macos(self):
        import fcntl as fcntl_mod

        # Every operation code passed to the patched flock lands here.
        seen_ops: list[int] = []

        with tempfile.TemporaryDirectory() as tmp:
            lock_file = Path(tmp) / "smolmachines.lock"
            with patch.object(loopback_alias, "_is_macos", return_value=True):
                with patch.object(loopback_alias, "_ALLOC_LOCK_PATH", lock_file):
                    with patch.object(
                        loopback_alias, "_aliases_in_use", return_value=set()
                    ):
                        with patch.object(
                            loopback_alias.fcntl,
                            "flock",
                            side_effect=lambda fd, op: seen_ops.append(op),
                        ):
                            loopback_alias.allocate("demo")

        self.assertIn(fcntl_mod.LOCK_EX, seen_ops)

    def test_no_lock_on_linux(self):
        # The non-macOS path returns before the lock file is touched.
        with patch.object(loopback_alias, "_is_macos", return_value=False):
            with patch.object(loopback_alias.fcntl, "flock") as flock_mock:
                loopback_alias.allocate("demo")

        flock_mock.assert_not_called()

    def test_sequential_allocations_with_shared_lock_are_serialised(self):
        # Two back-to-back calls against one lock file. The second
        # call is shown {127.0.0.16} as in use (as if the first
        # caller's docker run completed between the two lock
        # acquisitions), so it must return the next alias.
        in_use_per_call = (set(), {"127.0.0.16"})
        handed_out: list[str] = []

        with tempfile.TemporaryDirectory() as tmp:
            lock_file = Path(tmp) / "smolmachines.lock"
            for in_use in in_use_per_call:
                with patch.object(loopback_alias, "_is_macos", return_value=True):
                    with patch.object(
                        loopback_alias, "_ALLOC_LOCK_PATH", lock_file
                    ):
                        with patch.object(
                            loopback_alias, "_aliases_in_use", return_value=in_use
                        ):
                            handed_out.append(loopback_alias.allocate("demo"))

        self.assertEqual(["127.0.0.16", "127.0.0.17"], handed_out)
 class TestAliasInUseDetection(unittest.TestCase):
    """`_aliases_in_use` inspects every running bundle and pulls
    each container's port-binding `HostIp` out. The detection has
@@ -12,6 +12,7 @@ import unittest
 from pathlib import Path
 from unittest.mock import patch
 from bot_bottle.backend.smolmachines import smolvm as smolvm_mod
 from bot_bottle.backend.smolmachines.smolvm import (
    SmolvmError,
    SmolvmRunResult,
@@ -23,6 +24,7 @@ from bot_bottle.backend.smolmachines.smolvm import (
    machine_start,
    machine_stop,
    pack_create,
    wait_exec_ready,
 )
@@ -204,6 +206,45 @@ class TestErrorPath(unittest.TestCase):
        self.assertEqual(SmolvmRunResult(42, "", "nope"), r)
 class TestWaitExecReady(unittest.TestCase):
    """wait_exec_ready keeps calling machine_exec(name, ["true"])
    until one call returns 0, then returns. When the timeout runs
    out first it calls die()."""

    def test_returns_immediately_when_exec_succeeds_first_try(self):
        ready = SmolvmRunResult(0, "", "")
        with patch.object(smolvm_mod, "machine_exec", return_value=ready) as exec_mock:
            wait_exec_ready("vm-x")

        exec_mock.assert_called_once_with("vm-x", ["true"])

    def test_retries_on_nonzero_and_returns_on_success(self):
        not_ready = SmolvmRunResult(1, "", "not ready")
        outcomes = [not_ready, not_ready, SmolvmRunResult(0, "", "")]

        with patch.object(smolvm_mod, "machine_exec", side_effect=outcomes) as exec_mock:
            # sleep is patched out so the backoff costs no wall time.
            with patch.object(smolvm_mod.time, "sleep"):
                wait_exec_ready("vm-x")

        self.assertEqual(3, exec_mock.call_count)

    def test_dies_on_timeout(self):
        # machine_exec never succeeds, and the third monotonic()
        # reading is already past the 5s deadline, so the polling
        # loop gives up and die() is reached.
        clock = [0.0, 0.0, 10.0]
        failing = SmolvmRunResult(1, "", "")

        with patch.object(smolvm_mod, "machine_exec", return_value=failing):
            with patch.object(smolvm_mod.time, "monotonic", side_effect=clock):
                with patch.object(smolvm_mod.time, "sleep"):
                    with patch.object(
                        smolvm_mod, "die", side_effect=SystemExit("die")
                    ) as die_mock:
                        with self.assertRaises(SystemExit):
                            wait_exec_ready("vm-x", timeout=5.0)

        die_mock.assert_called_once()
        self.assertIn("vm-x", die_mock.call_args.args[0])
 class TestIsAvailable(unittest.TestCase):
    def test_true_when_on_path(self):
        with patch(