PRD 0032: Decompose smolmachines launch and harden bringup sequencing #123
@@ -21,7 +21,6 @@ from __future__ import annotations
|
||||
|
||||
import dataclasses
|
||||
import os
|
||||
import time
|
||||
from contextlib import ExitStack, contextmanager
|
||||
from pathlib import Path
|
||||
from typing import Callable, Generator
|
||||
@@ -94,200 +93,23 @@ def launch(
|
||||
via the ExitStack."""
|
||||
stack = ExitStack()
|
||||
try:
|
||||
# 1. Reserve a loopback alias for this bottle. macOS only
|
||||
# routes 127.0.0.1 by default; the per-bottle alias is
|
||||
# what bundles the docker port-publishes and TSI allowlist
|
||||
# against, so this bottle can't reach other bottles' (or
|
||||
# other host services') ports on the loopback. Lazy
|
||||
# sudo-driven on first use per boot. No-op on Linux.
|
||||
_loopback.ensure_pool()
|
||||
loopback_ip = _loopback.allocate(plan.slug)
|
||||
loopback_ip, network = _allocate_resources(plan, stack)
|
||||
plan = _mint_certs(plan)
|
||||
plan = _start_bundle(plan, network, loopback_ip, stack)
|
||||
plan = _discover_urls(plan, loopback_ip)
|
||||
|
||||
# 2. Per-bottle docker bridge.
|
||||
network = _bundle.bundle_network_name(plan.slug)
|
||||
_bundle.create_bundle_network(network, plan.bundle_subnet, plan.bundle_gateway)
|
||||
stack.callback(_bundle.remove_bundle_network, network)
|
||||
|
||||
# 2. Mint per-bottle CAs and update the inner Plans with
|
||||
# their launch-time paths. pipelock always runs in the
|
||||
# bundle; egress's CA is only minted when the bottle
|
||||
# declares routes (otherwise egress runs idle without
|
||||
# MITM and the CA files would be unused).
|
||||
ca_cert_host, ca_key_host = pipelock_tls_init(plan.proxy_plan.yaml_path.parent)
|
||||
proxy_plan = dataclasses.replace(
|
||||
plan.proxy_plan,
|
||||
ca_cert_host_path=ca_cert_host,
|
||||
ca_key_host_path=ca_key_host,
|
||||
)
|
||||
egress_plan = plan.egress_plan
|
||||
if egress_plan.routes:
|
||||
egress_ca_host, egress_ca_cert_only = egress_tls_init(
|
||||
plan.egress_plan.routes_path.parent,
|
||||
)
|
||||
egress_plan = dataclasses.replace(
|
||||
egress_plan,
|
||||
mitmproxy_ca_host_path=egress_ca_host,
|
||||
mitmproxy_ca_cert_only_host_path=egress_ca_cert_only,
|
||||
pipelock_ca_host_path=ca_cert_host,
|
||||
# On smolmachines, egress's upstream is pipelock
|
||||
# on the bundle's localhost — they're in the same
|
||||
# container's network namespace.
|
||||
pipelock_proxy_url=BUNDLE_LOCAL_PIPELOCK_URL,
|
||||
)
|
||||
plan = dataclasses.replace(
|
||||
plan, proxy_plan=proxy_plan, egress_plan=egress_plan,
|
||||
)
|
||||
|
||||
# 3. Build the BundleLaunchSpec from the (now-resolved)
|
||||
# inner Plans: daemon subset, env, bind-mounts, and the
|
||||
# loopback alias to bind published ports against. The
|
||||
# spec's ports_to_publish list expands depending on which
|
||||
# daemons the agent needs to reach from the smolvm guest.
|
||||
bundle_spec = _bundle_launch_spec(plan, network, loopback_ip)
|
||||
token_env = _resolve_token_env(plan, dict(os.environ))
|
||||
_bundle.ensure_bundle_image(bundle_spec.image)
|
||||
_bundle.start_bundle(bundle_spec, env={**os.environ, **token_env})
|
||||
stack.callback(_bundle.stop_bundle, plan.slug)
|
||||
|
||||
# 4. Discover the host-side ports docker assigned for the
|
||||
# bundle's published container ports, and bind the
|
||||
# agent's URLs to `<loopback_ip>:<host port>`. Docker
|
||||
# container IPs (192.168.x.x in the daemon's bridge)
|
||||
# aren't reachable from the smolvm guest on macOS — TSI
|
||||
# uses macOS networking, and macOS sees the daemon's
|
||||
# bridge via the published-port loopback forward only.
|
||||
#
|
||||
# Proxy hop order matches the docker backend: when the
|
||||
# bottle declares egress routes, the agent's first hop is
|
||||
# egress (for token injection), then pipelock. Without
|
||||
# routes, the agent dials pipelock directly. Whichever
|
||||
# one is "agent-facing" is the daemon whose port we
|
||||
# publish on host loopback; the other stays bundle-
|
||||
# internal as the upstream proxy.
|
||||
if plan.egress_plan.routes:
|
||||
agent_facing_port = _EGRESS_PORT
|
||||
else:
|
||||
agent_facing_port = _PIPELOCK_PORT
|
||||
agent_facing_host_port = _bundle.bundle_host_port(
|
||||
plan.slug, agent_facing_port, host_ip=loopback_ip,
|
||||
)
|
||||
agent_proxy_url = f"http://{loopback_ip}:{agent_facing_host_port}"
|
||||
agent_git_gate_host = ""
|
||||
if plan.git_gate_plan.upstreams:
|
||||
git_gate_host_port = _bundle.bundle_host_port(
|
||||
plan.slug, _GIT_HTTP_PORT, host_ip=loopback_ip,
|
||||
)
|
||||
agent_git_gate_host = f"{loopback_ip}:{git_gate_host_port}"
|
||||
agent_supervise_url = ""
|
||||
if plan.supervise_plan is not None:
|
||||
supervise_host_port = _bundle.bundle_host_port(
|
||||
plan.slug, _SUPERVISE_PORT, host_ip=loopback_ip,
|
||||
)
|
||||
agent_supervise_url = f"http://{loopback_ip}:{supervise_host_port}/"
|
||||
|
||||
# Stamp the URLs onto the plan + guest_env. provision_git
|
||||
# and provision_supervise read the plan fields; the agent
|
||||
# reads guest_env on every exec_agent.
|
||||
#
|
||||
# NO_PROXY has to include the per-bottle loopback alias —
|
||||
# otherwise claude's HTTPS_PROXY catches direct calls to
|
||||
# the supervise URL (`http://<alias>:<port>/`) and proxies
|
||||
# them through egress, which has no route for the alias
|
||||
# and rejects with "Failed to connect". The smolmachines
|
||||
# git-gate URL uses smart HTTP, so it also has to bypass
|
||||
# the agent's HTTP_PROXY and go straight to the host-
|
||||
# published git HTTP endpoint. Append rather than overwrite
|
||||
# so prepare.py's
|
||||
# `localhost,127.0.0.1` baseline stays in place.
|
||||
existing_no_proxy = plan.guest_env.get("NO_PROXY", "localhost,127.0.0.1")
|
||||
guest_env = {
|
||||
**plan.guest_env,
|
||||
"HTTPS_PROXY": agent_proxy_url,
|
||||
"HTTP_PROXY": agent_proxy_url,
|
||||
"NO_PROXY": f"{existing_no_proxy},{loopback_ip}",
|
||||
}
|
||||
if agent_git_gate_host:
|
||||
guest_env["GIT_GATE_URL"] = f"http://{agent_git_gate_host}"
|
||||
if agent_supervise_url:
|
||||
guest_env["MCP_SUPERVISE_URL"] = agent_supervise_url
|
||||
plan = dataclasses.replace(
|
||||
plan,
|
||||
guest_env=guest_env,
|
||||
agent_proxy_url=agent_proxy_url,
|
||||
agent_git_gate_host=agent_git_gate_host,
|
||||
agent_supervise_url=agent_supervise_url,
|
||||
)
|
||||
|
||||
# 5. Build the agent image and pack it into a
|
||||
# `.smolmachine` artifact (or hit the per-Dockerfile-digest
|
||||
# cache). Runs here, not in prepare, so the docker-build
|
||||
# output doesn't garble the dashboard's preflight modal:
|
||||
# both the curses-endwin path and the tmux pane-routing
|
||||
# path redirect stderr around `launch` already.
|
||||
# Build the agent image and pack it into a `.smolmachine`
|
||||
# artifact (or hit the per-Dockerfile-digest cache). Runs
|
||||
# here, not in prepare, so the docker-build output doesn't
|
||||
# garble the dashboard's preflight modal.
|
||||
agent_from_path = _ensure_smolmachine(
|
||||
plan.agent_image_ref,
|
||||
dockerfile=plan.agent_dockerfile_path,
|
||||
)
|
||||
|
||||
# smolvm VM. --from carries the pre-packed .smolmachine
|
||||
# artifact; --allow-cidr + -e carry the per-bottle TSI
|
||||
# allowlist + env. The allowlist is the per-bottle
|
||||
# loopback alias — narrowing it to one /32 keeps the
|
||||
# agent from reaching other host loopback services or
|
||||
# other bottles' published ports. Smolfile isn't usable
|
||||
# here — smolvm 0.8.0 makes `--from` and `--smolfile`
|
||||
# mutually exclusive.
|
||||
_smolvm.machine_create(
|
||||
plan.machine_name,
|
||||
from_path=agent_from_path,
|
||||
allow_cidrs=[f"{loopback_ip}/32"],
|
||||
env=plan.guest_env,
|
||||
)
|
||||
stack.callback(_smolvm.machine_delete, plan.machine_name)
|
||||
# Workaround smolvm 0.8.0: `--allow-cidr` is silently
|
||||
# dropped when combined with `--from`. Patch the persisted
|
||||
# state DB to set the allowlist before start so the booted
|
||||
# VM's TSI actually enforces. See loopback_alias's module
|
||||
# docstring for the investigation that led here.
|
||||
_loopback.force_allowlist(plan.machine_name, [f"{loopback_ip}/32"])
|
||||
_smolvm.machine_start(plan.machine_name)
|
||||
stack.callback(_smolvm.machine_stop, plan.machine_name)
|
||||
_launch_vm(plan, agent_from_path, loopback_ip, stack)
|
||||
_init_vm(plan)
|
||||
|
||||
# 6. Repair filesystem ownership + perms that smolvm's
|
||||
# pack process remapped to the host invoker's uid (501
|
||||
# on macOS) rather than preserving the image's expected
|
||||
# ownership.
|
||||
#
|
||||
# - /home/node → node:node so the node user can write
|
||||
# its own dotfiles (claude appendFileSync on
|
||||
# ~/.claude.json otherwise bails with ENOENT/EPERM
|
||||
# and the TUI hangs without surfacing the error).
|
||||
# - /tmp + /var/tmp → root:root mode 1777 so non-root
|
||||
# processes can create their per-uid scratch dirs
|
||||
# (claude-code creates /tmp/claude-<uid>/ as soon as
|
||||
# it spawns a Bash tool call).
|
||||
#
|
||||
# All folded into one sh -c so we only pay one
|
||||
# machine_exec round trip — back-to-back exec calls
|
||||
# right after machine_start hit a SIGKILL race in
|
||||
# libkrun's exec channel (see provision_ca for the
|
||||
# other half of this same workaround).
|
||||
_smolvm.machine_exec(plan.machine_name, [
|
||||
"sh", "-c",
|
||||
"chown -R node:node /home/node && "
|
||||
"chown root:root /tmp /var/tmp && "
|
||||
"chmod 1777 /tmp /var/tmp",
|
||||
])
|
||||
|
||||
# Wait briefly for the VM to settle. Back-to-back smolvm
|
||||
# machine_exec calls immediately after machine_start
|
||||
# occasionally SIGKILL the in-VM child at ~100ms (looks
|
||||
# like a VM warm-up race in libkrun's exec channel).
|
||||
# 1.5s is empirically enough to dodge it; provisioning
|
||||
# already takes seconds so the wait is amortized.
|
||||
time.sleep(1.5)
|
||||
|
||||
# 7. Provision (CA / prompt / skills / git / supervise).
|
||||
prompt_path = provision(plan, plan.machine_name)
|
||||
|
||||
yield SmolmachinesBottle(
|
||||
@@ -301,6 +123,180 @@ def launch(
|
||||
stack.close()
|
||||
|
||||
|
||||
def _allocate_resources(
    plan: SmolmachinesBottlePlan,
    stack: ExitStack,
) -> tuple[str, str]:
    """Claim this bottle's loopback alias and bring up its docker bridge.

    The per-bottle alias is what TSI's allowlist is scoped to, so the
    agent can only reach this bottle's published ports and not other
    bottles' or host services' ports on loopback. macOS only routes
    127.0.0.1 by default, hence the alias; the alias step is a no-op
    on Linux. The bridge network is created on every platform.

    Removal of the bridge network is registered on `stack`.

    Returns `(loopback_ip, network)`: the allocated alias address and
    the name of the bundle network that was created."""
    # Ensure the alias pool first, then pick this bottle's address.
    _loopback.ensure_pool()
    alias_ip = _loopback.allocate(plan.slug)

    bridge_name = _bundle.bundle_network_name(plan.slug)
    _bundle.create_bundle_network(
        bridge_name, plan.bundle_subnet, plan.bundle_gateway,
    )
    # Teardown is registered only after the network was created.
    stack.callback(_bundle.remove_bundle_network, bridge_name)

    return alias_ip, bridge_name
|
||||
|
||||
|
||||
def _mint_certs(plan: SmolmachinesBottlePlan) -> SmolmachinesBottlePlan:
    """Generate the bottle's CAs and return a plan carrying their paths.

    The pipelock CA is always minted because pipelock always runs in
    the bundle. The egress CA is minted only when the bottle declares
    routes; without routes egress runs idle without MITM and the CA
    files would go unused, so the egress plan is passed through as-is.

    The input plan is not mutated; updated copies of the inner plans
    are produced with `dataclasses.replace`."""
    pipelock_cert, pipelock_key = pipelock_tls_init(
        plan.proxy_plan.yaml_path.parent,
    )
    updated_proxy = dataclasses.replace(
        plan.proxy_plan,
        ca_cert_host_path=pipelock_cert,
        ca_key_host_path=pipelock_key,
    )

    updated_egress = plan.egress_plan
    if updated_egress.routes:
        mitm_ca, mitm_ca_cert_only = egress_tls_init(
            updated_egress.routes_path.parent,
        )
        egress_fields = {
            "mitmproxy_ca_host_path": mitm_ca,
            "mitmproxy_ca_cert_only_host_path": mitm_ca_cert_only,
            "pipelock_ca_host_path": pipelock_cert,
            # On smolmachines, egress's upstream is pipelock on the
            # bundle's localhost: both live in the same container's
            # network namespace.
            "pipelock_proxy_url": BUNDLE_LOCAL_PIPELOCK_URL,
        }
        updated_egress = dataclasses.replace(updated_egress, **egress_fields)

    return dataclasses.replace(
        plan,
        proxy_plan=updated_proxy,
        egress_plan=updated_egress,
    )
|
||||
|
||||
|
||||
def _start_bundle(
    plan: SmolmachinesBottlePlan,
    network: str,
    loopback_ip: str,
    stack: ExitStack,
) -> SmolmachinesBottlePlan:
    """Start the sidecar bundle container for this bottle.

    Builds the BundleLaunchSpec from the plan, resolves the token env
    against the current process environment, makes sure the bundle
    image is available, starts the container, and registers
    `stop_bundle` on `stack`.

    Returns the plan unchanged; the launch spec is consumed here."""
    spec = _bundle_launch_spec(plan, network, loopback_ip)
    resolved_tokens = _resolve_token_env(plan, dict(os.environ))

    _bundle.ensure_bundle_image(spec.image)

    # Resolved tokens are layered over the host environment, so they
    # win on key collisions.
    launch_env = {**os.environ, **resolved_tokens}
    _bundle.start_bundle(spec, env=launch_env)
    stack.callback(_bundle.stop_bundle, plan.slug)

    return plan
|
||||
|
||||
|
||||
def _discover_urls(
    plan: SmolmachinesBottlePlan,
    loopback_ip: str,
) -> SmolmachinesBottlePlan:
    """Resolve the bundle's published host ports into agent-facing URLs.

    Docker container IPs (192.168.x.x in the daemon's bridge) aren't
    reachable from the smolvm guest on macOS: TSI uses macOS
    networking, and macOS sees the daemon's bridge only via the
    published-port loopback forward. Every URL is therefore built as
    `<loopback_ip>:<host port>`.

    Proxy hop order: with egress routes declared, the agent's first
    hop is egress (for token injection), then pipelock; without
    routes the agent dials pipelock directly.

    NO_PROXY is extended (not overwritten) with the per-bottle
    loopback alias so the supervise and git-gate URLs bypass
    HTTPS_PROXY.

    Returns a copy of the plan with `guest_env`, `agent_proxy_url`,
    `agent_git_gate_host` and `agent_supervise_url` filled in."""

    def host_port(container_port):
        # Host-side port docker assigned for one published bundle port.
        return _bundle.bundle_host_port(
            plan.slug, container_port, host_ip=loopback_ip,
        )

    first_hop_port = _EGRESS_PORT if plan.egress_plan.routes else _PIPELOCK_PORT
    proxy_url = f"http://{loopback_ip}:{host_port(first_hop_port)}"

    # Optional endpoints stay empty strings when their daemon is not
    # part of this bottle.
    git_gate_host = ""
    if plan.git_gate_plan.upstreams:
        git_gate_host = f"{loopback_ip}:{host_port(_GIT_HTTP_PORT)}"

    supervise_url = ""
    if plan.supervise_plan is not None:
        supervise_url = f"http://{loopback_ip}:{host_port(_SUPERVISE_PORT)}/"

    baseline_no_proxy = plan.guest_env.get("NO_PROXY", "localhost,127.0.0.1")
    env = dict(plan.guest_env)
    env.update({
        "HTTPS_PROXY": proxy_url,
        "HTTP_PROXY": proxy_url,
        "NO_PROXY": f"{baseline_no_proxy},{loopback_ip}",
    })
    if git_gate_host:
        env["GIT_GATE_URL"] = f"http://{git_gate_host}"
    if supervise_url:
        env["MCP_SUPERVISE_URL"] = supervise_url

    return dataclasses.replace(
        plan,
        guest_env=env,
        agent_proxy_url=proxy_url,
        agent_git_gate_host=git_gate_host,
        agent_supervise_url=supervise_url,
    )
|
||||
|
||||
|
||||
def _launch_vm(
    plan: SmolmachinesBottlePlan,
    agent_from_path: Path,
    loopback_ip: str,
    stack: ExitStack,
) -> None:
    """Create the smolvm VM, pin its allowlist, and boot it.

    The allowlist is the per-bottle loopback alias as a /32, so the
    guest can only reach this bottle's bundle ports. Smolfile isn't
    usable here: smolvm 0.8.0 makes --from and --smolfile mutually
    exclusive.

    `machine_delete` and `machine_stop` are registered on `stack`,
    each right after the step it undoes."""
    vm_name = plan.machine_name
    bottle_cidr = f"{loopback_ip}/32"

    _smolvm.machine_create(
        vm_name,
        from_path=agent_from_path,
        allow_cidrs=[bottle_cidr],
        env=plan.guest_env,
    )
    stack.callback(_smolvm.machine_delete, vm_name)

    # Workaround smolvm 0.8.0: `--allow-cidr` is silently dropped when
    # combined with `--from`. Patch the persisted state DB before
    # start so the booted VM's TSI actually enforces the allowlist.
    _loopback.force_allowlist(vm_name, [bottle_cidr])

    _smolvm.machine_start(vm_name)
    stack.callback(_smolvm.machine_stop, vm_name)
|
||||
|
||||
|
||||
def _init_vm(plan: SmolmachinesBottlePlan) -> None:
    """Fix guest filesystem ownership, then wait for the exec channel.

    smolvm's pack process remaps files to the host invoker's uid (501
    on macOS), so two repairs are needed inside the guest:

    - /home/node becomes node:node so Claude Code can write
      ~/.claude.json.
    - /tmp and /var/tmp become root:root mode 1777 so non-root
      processes can create their per-uid scratch dirs.

    The repairs are chained into a single `sh -c` to avoid
    back-to-back exec calls immediately after machine_start (libkrun
    exec-channel race).

    `wait_exec_ready` then polls until the exec channel is ready for
    the provision calls that follow, replacing the earlier fixed
    sleep."""
    repair_steps = (
        "chown -R node:node /home/node",
        "chown root:root /tmp /var/tmp",
        "chmod 1777 /tmp /var/tmp",
    )
    repair_script = " && ".join(repair_steps)
    _smolvm.machine_exec(plan.machine_name, ["sh", "-c", repair_script])

    _smolvm.wait_exec_ready(plan.machine_name)
|
||||
|
||||
|
||||
def _bundle_launch_spec(
|
||||
plan: SmolmachinesBottlePlan, network: str, loopback_ip: str,
|
||||
) -> _bundle.BundleLaunchSpec:
|
||||
@@ -324,10 +320,9 @@ def _bundle_launch_spec(
|
||||
# is "agent-facing" gets its port published on the host
|
||||
# loopback (see `_ensure_smolmachine`'s discovery loop) and the
|
||||
# other stays bundle-internal. The bundle is NOT reachable by
|
||||
# bridge IP from the smolvm guest, so the
|
||||
# PRD-0023-chunk-3 EGRESS_LISTEN_HOST=127.0.0.1 mitigation
|
||||
# isn't needed: the agent can only dial whatever daemon's
|
||||
# host port we publish, period.
|
||||
# bridge IP from the smolvm guest on macOS — TSI uses macOS
|
||||
# networking, and macOS sees the daemon's bridge via the
|
||||
# published-port loopback forward only.
|
||||
|
||||
# --- pipelock ---------------------------------------------
|
||||
pp = plan.proxy_plan
|
||||
|
||||
@@ -45,6 +45,7 @@ alias gets handed to a new bottle."""
|
||||
|
||||
from __future__ import annotations
|
||||
|
||||
import fcntl
|
||||
import json
|
||||
import os
|
||||
import platform
|
||||
@@ -83,6 +84,14 @@ _POOL_START = 16
|
||||
_POOL_END = 31 # inclusive
|
||||
|
||||
|
||||
# File lock that serialises concurrent allocate() calls so two
|
||||
# simultaneous launches can't read the same docker state and claim
|
||||
# the same alias. Narrowed to the allocate() call itself; docker run
|
||||
# runs after the lock is released. Once the container is running it
|
||||
# appears in docker state and future allocate() calls will see it.
|
||||
_ALLOC_LOCK_PATH = Path.home() / ".cache" / "bot-bottle" / "smolmachines.lock"
|
||||
|
||||
|
||||
# Loopback aliases pool: 127.0.0.<start>..127.0.0.<end>.
|
||||
def _pool_addresses() -> list[str]:
    """Return every pool alias, 127.0.0.<_POOL_START> through
    127.0.0.<_POOL_END> inclusive, in ascending order."""
    last_octets = range(_POOL_START, _POOL_END + 1)
    return [f"127.0.0.{octet}" for octet in last_octets]
|
||||
@@ -179,9 +188,20 @@ def allocate(slug: str) -> str:
|
||||
On non-macOS the whole `127.0.0.0/8` is loopback by default;
|
||||
`127.0.0.1` is fine to share and we skip the alias dance.
|
||||
This still returns a deterministic address so launch.py's
|
||||
callers don't have to branch on platform."""
|
||||
callers don't have to branch on platform.
|
||||
|
||||
An exclusive file lock serialises concurrent calls so two
|
||||
simultaneous launches don't read the same docker state and
|
||||
claim the same alias."""
|
||||
if not _is_macos():
|
||||
return "127.0.0.1"
|
||||
_ALLOC_LOCK_PATH.parent.mkdir(parents=True, exist_ok=True)
|
||||
with open(_ALLOC_LOCK_PATH, "w") as lf:
|
||||
fcntl.flock(lf, fcntl.LOCK_EX)
|
||||
return _allocate_locked()
|
||||
|
||||
|
||||
def _allocate_locked() -> str:
|
||||
in_use = _aliases_in_use()
|
||||
for ip in _pool_addresses():
|
||||
if ip not in in_use:
|
||||
|
||||
@@ -27,11 +27,13 @@ from __future__ import annotations
|
||||
|
||||
import shutil
|
||||
import subprocess
|
||||
import time
|
||||
from dataclasses import dataclass
|
||||
from pathlib import Path
|
||||
from typing import Mapping, Sequence
|
||||
|
||||
|
||||
|
||||
_SMOLVM = "smolvm"
|
||||
|
||||
|
||||
@@ -197,6 +199,34 @@ def machine_exec(
|
||||
)
|
||||
|
||||
|
||||
def wait_exec_ready(name: str, *, timeout: float = 5.0) -> None:
    """Poll `machine exec true` until exit 0 or `timeout` elapses.

    Replaces `time.sleep(1.5)` after `machine_start`: libkrun's exec
    channel needs a brief warm-up before back-to-back exec calls are
    safe. Polling exits as soon as the channel is ready and fails
    loudly if the VM never responds.

    Args:
        name: smolvm machine name to probe.
        timeout: Overall budget in seconds. Probes are spaced with an
            exponential backoff starting at 0.1s and capped at 0.5s.

    Raises:
        SmolvmError: no probe exited 0 before the deadline. The
            attached CompletedProcess is synthetic (returncode -1)
            and describes the timeout rather than any single probe.
    """
    deadline = time.monotonic() + timeout
    delay = 0.1
    while time.monotonic() < deadline:
        r = machine_exec(name, ["true"])
        if r.returncode == 0:
            return
        # Never sleep past the deadline; bail out if it already passed
        # while the probe itself was running.
        remaining = deadline - time.monotonic()
        if remaining <= 0:
            break
        time.sleep(min(delay, remaining))
        delay = min(delay * 2, 0.5)

    # Use the module's binary-name constant rather than a second
    # hard-coded "smolvm" literal.
    argv = [_SMOLVM, "machine", "exec", "--name", name, "--", "true"]
    raise SmolvmError(
        argv,
        subprocess.CompletedProcess(
            args=argv, returncode=-1, stdout="",
            # `:g` keeps fractional timeouts legible (0.5 -> "0.5s");
            # `.0f` would have reported them as "0s".
            stderr=f"exec channel not ready after {timeout:g}s — VM may have failed to boot.",
        ),
    )
|
||||
|
||||
|
||||
def machine_cp(src: str, dst: str) -> None:
|
||||
"""`smolvm machine cp SRC DST`. Path syntax: `machine:path` to
|
||||
reference a path inside the VM, bare path for the host. Both
|
||||
|
||||
@@ -0,0 +1,221 @@
|
||||
# PRD 0032: Decompose smolmachines launch and harden bringup sequencing
|
||||
|
||||
- **Status:** Active
|
||||
- **Author:** didericis-claude
|
||||
- **Created:** 2026-06-02
|
||||
- **Issue:** #122
|
||||
|
||||
## Summary
|
||||
|
||||
Split `launch()` into named per-step helpers, replace the empirical
|
||||
`time.sleep(1.5)` with a readiness poll, and file-lock loopback alias
|
||||
allocation. Addresses the three actionable issues from the #117 hotspot
|
||||
review of `smolmachines/launch.py`.
|
||||
|
||||
## Problem
|
||||
|
||||
### 1. `launch()` step ordering
|
||||
|
||||
`launch()` in `smolmachines/launch.py` is 207 lines. Seven sequenced
|
||||
steps are marked by numbered inline comments (`# 1. Reserve a loopback
|
||||
alias`, `# 2. Mint per-bottle CAs`, ...) — the sequencing is
|
||||
load-bearing (CA paths must be filled before the bundle spec is built;
|
||||
the bundle must be running before port discovery; the VM must be created
|
||||
before the allowlist is patched), but the dependencies are enforced only
|
||||
by linear ordering within one function. Adding a new daemon, changing
|
||||
the port-forward strategy, or debugging a bringup failure requires
|
||||
reading the whole function to understand what state each step produces.
|
||||
Each step is also not individually testable without mocking the entire
|
||||
surrounding context.
|
||||
|
||||
### 2. `time.sleep(1.5)` for libkrun exec-channel race
|
||||
|
||||
After `machine_start`, back-to-back `machine_exec` calls occasionally
|
||||
hit a SIGKILL in libkrun's exec channel at ~100ms. The sleep is
|
||||
documented as "1.5s is empirically enough; provisioning already takes
|
||||
seconds so the wait is amortized." The failure mode if the sleep is
|
||||
insufficient: the filesystem-repair exec (`chown -R node:node /home/node`)
|
||||
is SIGKILLed silently, and the agent later bails with `ENOENT`/`EPERM`
|
||||
when Claude Code tries to write to `~/.claude.json`. A poll-until-ready
|
||||
loop is more robust than a fixed duration: it exits as soon as the exec
|
||||
channel is up, fails loudly with a timeout if the VM never becomes
|
||||
responsive, and is self-documenting about what it is waiting for.
|
||||
|
||||
### 3. Loopback alias allocation is not concurrent-safe
|
||||
|
||||
`loopback_alias.allocate()` reads docker container state to determine
|
||||
which aliases are already in use, then returns the lowest free alias.
|
||||
There is no lock between that read and the bundle's `docker run` (which
|
||||
creates the container that will appear in future `docker ps` output). Two
|
||||
simultaneous bottle launches can both see the same alias as free and
|
||||
claim it, causing both bundles to bind on the same loopback IP. On macOS,
|
||||
where users occasionally start multiple agents in quick succession, this
|
||||
is a realistic failure mode.
|
||||
|
||||
## Non-goals
|
||||
|
||||
- Removing `force_allowlist` / the `--allow-cidr` DB patch. That is a
|
||||
workaround for a smolvm 0.8.0 bug; removal is a one-liner when smolvm
|
||||
honors the CLI flag upstream.
|
||||
- Changing the ephemeral registry / crane detour in `local_registry.py`.
|
||||
Required by Docker Desktop's network topology.
|
||||
- Changing `_ensure_smolmachine`'s cache design. Cache invalidation by
|
||||
docker image ID works; issue #111 tracks a separate stale-sidecar
|
||||
concern.
|
||||
|
||||
## Design
|
||||
|
||||
### 1. Decompose `launch()` into named helpers
|
||||
|
||||
Extract six focused helpers. `launch()` becomes a coordinator that calls
|
||||
them in order, passing the `ExitStack` for teardown registration:
|
||||
|
||||
```
|
||||
_allocate_resources(plan, stack) → (loopback_ip, network)
|
||||
```
|
||||
Reserve the loopback alias, create the docker bridge network, register
|
||||
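a teardown callback for the network; the alias itself needs no explicit release because in-use aliases are derived from docker state.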
|
||||
|
||||
```
|
||||
_mint_certs(plan) → plan
|
||||
```
|
||||
Pipelock TLS init (always). Egress TLS init when `plan.egress_plan.routes`
|
||||
is non-empty. Returns the plan with CA paths filled via
|
||||
`dataclasses.replace`.
|
||||
|
||||
```
|
||||
_start_bundle(plan, network, loopback_ip, stack) → plan
|
||||
```
|
||||
Build the `BundleLaunchSpec`, resolve token env, start the bundle
|
||||
container, register teardown. Returns the plan unchanged: the
|
||||
`BundleLaunchSpec` is built and consumed inside the helper, and no
|
||||
plan field carries it.
|
||||
|
||||
```
|
||||
_discover_urls(plan, loopback_ip) → plan
|
||||
```
|
||||
Look up host-side ports for the published container ports; assemble
|
||||
`agent_proxy_url`, `agent_git_gate_host`, `agent_supervise_url`; stamp
|
||||
them onto the plan and into `guest_env`.
|
||||
|
||||
```
|
||||
_launch_vm(plan, agent_from_path, loopback_ip, stack) → None
|
||||
```
|
||||
`machine_create` + `force_allowlist` + `machine_start`. Register
|
||||
`machine_stop` and `machine_delete` teardown callbacks on the stack.
|
||||
|
||||
```
|
||||
_init_vm(plan) → None
|
||||
```
|
||||
Filesystem-repair exec (`chown`/`chmod`) followed by
|
||||
`wait_exec_ready()`.
|
||||
|
||||
`launch()` reduces to:
|
||||
|
||||
```python
|
||||
loopback_ip, network = _allocate_resources(plan, stack)
|
||||
plan = _mint_certs(plan)
|
||||
plan = _start_bundle(plan, network, loopback_ip, stack)
|
||||
plan = _discover_urls(plan, loopback_ip)
|
||||
agent_from_path = _ensure_smolmachine(plan.agent_image_ref,
|
||||
dockerfile=plan.agent_dockerfile_path)
|
||||
_launch_vm(plan, agent_from_path, loopback_ip, stack)
|
||||
_init_vm(plan)
|
||||
prompt_path = provision(plan, plan.machine_name)
|
||||
yield SmolmachinesBottle(...)
|
||||
```
|
||||
|
||||
Each helper's inputs and outputs are explicit; each is independently
|
||||
testable with a minimal set of mocks.
|
||||
|
||||
### 2. Replace `time.sleep(1.5)` with `wait_exec_ready`
|
||||
|
||||
Add to `smolvm.py`:
|
||||
|
||||
```python
|
||||
def wait_exec_ready(name: str, *, timeout: float = 5.0) -> None:
|
||||
"""Poll until `machine exec true` exits 0 or `timeout` elapses.
|
||||
Replaces a fixed sleep after machine_start for the libkrun
|
||||
exec-channel warm-up race."""
|
||||
deadline = time.monotonic() + timeout
|
||||
delay = 0.1
|
||||
while time.monotonic() < deadline:
|
||||
r = machine_exec(name, ["true"])
|
||||
if r.returncode == 0:
|
||||
return
|
||||
remaining = deadline - time.monotonic()
|
||||
if remaining <= 0:
|
||||
break
|
||||
time.sleep(min(delay, remaining))
|
||||
delay = min(delay * 2, 0.5)
|
||||
die(
|
||||
f"smolvm machine {name!r}: exec channel not ready after "
|
||||
f"{timeout:.0f}s — VM may have failed to boot."
|
||||
)
|
||||
```
|
||||
|
||||
`_init_vm` calls `wait_exec_ready` after the chown/chmod exec instead of
|
||||
`time.sleep(1.5)`. The `time` import in `launch.py` is removed.
|
||||
|
||||
### 3. File-lock loopback alias allocation
|
||||
|
||||
Add to `loopback_alias.py`:
|
||||
|
||||
```python
|
||||
import fcntl
|
||||
|
||||
_ALLOC_LOCK_PATH = Path.home() / ".cache" / "bot-bottle" / "smolmachines.lock"
|
||||
|
||||
def allocate(slug: str) -> str:
|
||||
if not _is_macos():
|
||||
return "127.0.0.1"
|
||||
_ALLOC_LOCK_PATH.parent.mkdir(parents=True, exist_ok=True)
|
||||
with open(_ALLOC_LOCK_PATH, "w") as lf:
|
||||
fcntl.flock(lf, fcntl.LOCK_EX)
|
||||
return _allocate_locked()
|
||||
|
||||
def _allocate_locked() -> str:
|
||||
in_use = _aliases_in_use()
|
||||
for ip in _pool_addresses():
|
||||
if ip not in in_use:
|
||||
return ip
|
||||
die(...)
|
||||
return ""
|
||||
```
|
||||
|
||||
The lock is held only for the duration of `_aliases_in_use()` + the
|
||||
`allocate` return. The bundle's `docker run` runs after the lock is
|
||||
released. This is sufficient: once `docker run` returns, the container
|
||||
is visible in docker state and future `allocate()` calls will see it.
|
||||
The remaining window (lock released → container appears in docker state)
|
||||
still spans bridge-network creation, CA minting, the bundle-image check,
|
||||
and `docker run` itself. The lock serialises the docker-state read, so a
|
||||
collision remains possible only when a second launch calls `allocate()`
|
||||
||||
inside that window.
|
||||
|
||||
The lock is a no-op on Linux (the `_is_macos()` early-return fires
|
||||
before the lock path is opened).
|
||||
|
||||
## Test impact
|
||||
|
||||
- Unit tests for each extracted helper can mock one subprocess boundary
|
||||
at a time (smolvm, docker, pipelock TLS init) without wiring the full
|
||||
`launch()` ExitStack.
|
||||
- `wait_exec_ready` needs a test with `machine_exec` stubbed to return
|
||||
non-zero N times before 0 — verifies the backoff loop and the timeout
|
||||
die path.
|
||||
- `allocate` tests are unchanged in shape; the lock is acquired and
|
||||
released within the call so tests don't need to be aware of it.
|
||||
|
||||
## Implementation chunks
|
||||
|
||||
1. **PRD (this commit).** Sets the design.
|
||||
2. **Decompose `launch()`.**
|
||||
3. **Replace sleep with `wait_exec_ready`.**
|
||||
4. **File-lock `allocate()`.**
|
||||
5. **Tests.** Unit tests for each helper; `wait_exec_ready` backoff + timeout.
|
||||
|
||||
## References
|
||||
|
||||
- Issue #122: Decompose smolmachines launch and harden bringup sequencing.
|
||||
- Issue #117: Complexity hotspots — source of the smolmachines/launch.py finding.
|
||||
- Issue #111: Smolmachine sidecar doesn't reliably get refreshed (separate, not addressed here).
|
||||
@@ -11,6 +11,7 @@ import json
|
||||
import sqlite3
|
||||
import subprocess
|
||||
import tempfile
|
||||
import threading
|
||||
import unittest
|
||||
from pathlib import Path
|
||||
from unittest.mock import patch
|
||||
@@ -144,6 +145,55 @@ class TestAllocate(unittest.TestCase):
|
||||
loopback_alias.allocate("demo-overflow")
|
||||
|
||||
|
||||
class TestAllocateLock(unittest.TestCase):
    """On macOS, allocate() takes an exclusive file lock so concurrent
    callers serialise instead of racing on docker state; on Linux it
    returns before the lock is ever touched."""

    def test_acquires_exclusive_lock_on_macos(self):
        import fcntl as fcntl_mod

        with tempfile.TemporaryDirectory() as tmp, \
                patch.object(loopback_alias, "_is_macos", return_value=True), \
                patch.object(loopback_alias, "_ALLOC_LOCK_PATH",
                             Path(tmp) / "smolmachines.lock"), \
                patch.object(loopback_alias, "_aliases_in_use", return_value=set()), \
                patch.object(loopback_alias.fcntl, "flock") as flock:
            loopback_alias.allocate("demo")

        # Second positional argument of each flock() call is the
        # requested lock operation.
        requested_ops = [call.args[1] for call in flock.call_args_list]
        self.assertIn(fcntl_mod.LOCK_EX, requested_ops)

    def test_no_lock_on_linux(self):
        # The non-macOS early return happens before the lock file is
        # opened, so flock must never be reached.
        with patch.object(loopback_alias, "_is_macos", return_value=False), \
                patch.object(loopback_alias.fcntl, "flock") as flock:
            loopback_alias.allocate("demo")

        flock.assert_not_called()

    def test_sequential_allocations_with_shared_lock_are_serialised(self):
        # Both calls go through the same lock file. The second call is
        # shown {127.0.0.16} as in use (as if the first caller's
        # docker run finished between the two lock acquisitions) and
        # must therefore hand out the next alias.
        observed_states = (set(), {"127.0.0.16"})
        allocated: list[str] = []

        with tempfile.TemporaryDirectory() as tmp:
            shared_lock = Path(tmp) / "smolmachines.lock"
            for in_use in observed_states:
                with patch.object(loopback_alias, "_is_macos", return_value=True), \
                        patch.object(loopback_alias, "_ALLOC_LOCK_PATH", shared_lock), \
                        patch.object(loopback_alias, "_aliases_in_use",
                                     return_value=in_use):
                    allocated.append(loopback_alias.allocate("demo"))

        self.assertEqual(["127.0.0.16", "127.0.0.17"], allocated)
|
||||
|
||||
|
||||
class TestAliasInUseDetection(unittest.TestCase):
|
||||
"""`_aliases_in_use` inspects every running bundle and pulls
|
||||
each container's port-binding `HostIp` out. The detection has
|
||||
|
||||
@@ -12,6 +12,7 @@ import unittest
|
||||
from pathlib import Path
|
||||
from unittest.mock import patch
|
||||
|
||||
from bot_bottle.backend.smolmachines import smolvm as smolvm_mod
|
||||
from bot_bottle.backend.smolmachines.smolvm import (
|
||||
SmolvmError,
|
||||
SmolvmRunResult,
|
||||
@@ -23,6 +24,7 @@ from bot_bottle.backend.smolmachines.smolvm import (
|
||||
machine_start,
|
||||
machine_stop,
|
||||
pack_create,
|
||||
wait_exec_ready,
|
||||
)
|
||||
|
||||
|
||||
@@ -204,6 +206,43 @@ class TestErrorPath(unittest.TestCase):
|
||||
self.assertEqual(SmolvmRunResult(42, "", "nope"), r)
|
||||
|
||||
|
||||
class TestWaitExecReady(unittest.TestCase):
    """wait_exec_ready polls machine_exec(name, ["true"]) until it
    returns 0, then exits. On timeout it raises SmolvmError."""

    def test_returns_immediately_when_exec_succeeds_first_try(self):
        # A zero exit code on the first probe means exactly one call.
        with patch.object(smolvm_mod, "machine_exec",
                          return_value=SmolvmRunResult(0, "", "")) as m:
            wait_exec_ready("vm-x")
        m.assert_called_once_with("vm-x", ["true"])

    def test_retries_on_nonzero_and_returns_on_success(self):
        # Two failing probes followed by a success: three calls total.
        results = [
            SmolvmRunResult(1, "", "not ready"),
            SmolvmRunResult(1, "", "not ready"),
            SmolvmRunResult(0, "", ""),
        ]
        # sleep is stubbed so the retry delay costs no wall-clock time.
        with patch.object(smolvm_mod, "machine_exec",
                          side_effect=results) as m, \
             patch.object(smolvm_mod.time, "sleep"):
            wait_exec_ready("vm-x")
        self.assertEqual(3, m.call_count)

    def test_raises_smolvm_error_on_timeout(self):
        # machine_exec always returns non-zero. The fake clock reads
        # 0.0 twice and then stays at 10.0 -- past the 5.0s timeout --
        # so the loop has to give up. The clock saturates instead of
        # being a fixed-length list, so an extra clock read inside
        # wait_exec_ready cannot fail the test with StopIteration.
        early_ticks = iter([0.0, 0.0])

        def fake_monotonic():
            return next(early_ticks, 10.0)

        with patch.object(smolvm_mod, "machine_exec",
                          return_value=SmolvmRunResult(1, "", "")), \
             patch.object(smolvm_mod.time, "monotonic",
                          side_effect=fake_monotonic), \
             patch.object(smolvm_mod.time, "sleep"):
            with self.assertRaises(SmolvmError) as cm:
                wait_exec_ready("vm-x", timeout=5.0)
        self.assertIn("vm-x", str(cm.exception))
        # The stubbed result carries only empty strings, so "not ready"
        # has to come from the message wait_exec_ready builds itself.
        self.assertIn("not ready", str(cm.exception))
|
||||
|
||||
|
||||
class TestIsAvailable(unittest.TestCase):
|
||||
def test_true_when_on_path(self):
|
||||
with patch(
|
||||
|
||||
Reference in New Issue
Block a user
Will this crash the dashboard?
If so, this should be some kind of error we raise instead, and some other handler should decide whether or not it dies.