PRD 0032: Decompose smolmachines launch and harden bringup sequencing #123
@@ -21,7 +21,6 @@ from __future__ import annotations
|
|||||||
|
|
||||||
import dataclasses
|
import dataclasses
|
||||||
import os
|
import os
|
||||||
import time
|
|
||||||
from contextlib import ExitStack, contextmanager
|
from contextlib import ExitStack, contextmanager
|
||||||
from pathlib import Path
|
from pathlib import Path
|
||||||
from typing import Callable, Generator
|
from typing import Callable, Generator
|
||||||
@@ -94,25 +93,60 @@ def launch(
|
|||||||
via the ExitStack."""
|
via the ExitStack."""
|
||||||
stack = ExitStack()
|
stack = ExitStack()
|
||||||
try:
|
try:
|
||||||
# 1. Reserve a loopback alias for this bottle. macOS only
|
loopback_ip, network = _allocate_resources(plan, stack)
|
||||||
# routes 127.0.0.1 by default; the per-bottle alias is
|
plan = _mint_certs(plan)
|
||||||
# what bundles the docker port-publishes and TSI allowlist
|
plan = _start_bundle(plan, network, loopback_ip, stack)
|
||||||
# against, so this bottle can't reach other bottles' (or
|
plan = _discover_urls(plan, loopback_ip)
|
||||||
# other host services') ports on the loopback. Lazy
|
|
||||||
# sudo-driven on first use per boot. No-op on Linux.
|
# Build the agent image and pack it into a `.smolmachine`
|
||||||
|
# artifact (or hit the per-Dockerfile-digest cache). Runs
|
||||||
|
# here, not in prepare, so the docker-build output doesn't
|
||||||
|
# garble the dashboard's preflight modal.
|
||||||
|
agent_from_path = _ensure_smolmachine(
|
||||||
|
plan.agent_image_ref,
|
||||||
|
dockerfile=plan.agent_dockerfile_path,
|
||||||
|
)
|
||||||
|
|
||||||
|
_launch_vm(plan, agent_from_path, loopback_ip, stack)
|
||||||
|
_init_vm(plan)
|
||||||
|
|
||||||
|
prompt_path = provision(plan, plan.machine_name)
|
||||||
|
|
||||||
|
yield SmolmachinesBottle(
|
||||||
|
plan.machine_name,
|
||||||
|
prompt_path=prompt_path,
|
||||||
|
guest_env=plan.guest_env,
|
||||||
|
agent_command=plan.agent_command,
|
||||||
|
agent_prompt_mode=plan.agent_prompt_mode,
|
||||||
|
)
|
||||||
|
finally:
|
||||||
|
stack.close()
|
||||||
|
|
||||||
|
|
||||||
|
def _allocate_resources(
|
||||||
|
plan: SmolmachinesBottlePlan,
|
||||||
|
stack: ExitStack,
|
||||||
|
) -> tuple[str, str]:
|
||||||
|
"""Reserve a loopback alias and create the per-bottle docker bridge.
|
||||||
|
|
||||||
|
macOS only routes 127.0.0.1 by default; the per-bottle alias
|
||||||
|
scopes TSI's allowlist to this bottle's published ports so the
|
||||||
|
agent can't reach other bottles' or host services' ports on
|
||||||
|
loopback. No-op on Linux."""
|
||||||
_loopback.ensure_pool()
|
_loopback.ensure_pool()
|
||||||
loopback_ip = _loopback.allocate(plan.slug)
|
loopback_ip = _loopback.allocate(plan.slug)
|
||||||
|
|
||||||
# 2. Per-bottle docker bridge.
|
|
||||||
network = _bundle.bundle_network_name(plan.slug)
|
network = _bundle.bundle_network_name(plan.slug)
|
||||||
_bundle.create_bundle_network(network, plan.bundle_subnet, plan.bundle_gateway)
|
_bundle.create_bundle_network(network, plan.bundle_subnet, plan.bundle_gateway)
|
||||||
stack.callback(_bundle.remove_bundle_network, network)
|
stack.callback(_bundle.remove_bundle_network, network)
|
||||||
|
return loopback_ip, network
|
||||||
|
|
||||||
# 2. Mint per-bottle CAs and update the inner Plans with
|
|
||||||
# their launch-time paths. pipelock always runs in the
|
def _mint_certs(plan: SmolmachinesBottlePlan) -> SmolmachinesBottlePlan:
|
||||||
# bundle; egress's CA is only minted when the bottle
|
"""Mint per-bottle CAs and return the plan with CA paths filled.
|
||||||
# declares routes (otherwise egress runs idle without
|
|
||||||
# MITM and the CA files would be unused).
|
Pipelock always runs in the bundle. Egress's CA is only minted
|
||||||
|
when the bottle declares routes — otherwise egress runs idle
|
||||||
|
without MITM and the CA files would be unused."""
|
||||||
ca_cert_host, ca_key_host = pipelock_tls_init(plan.proxy_plan.yaml_path.parent)
|
ca_cert_host, ca_key_host = pipelock_tls_init(plan.proxy_plan.yaml_path.parent)
|
||||||
proxy_plan = dataclasses.replace(
|
proxy_plan = dataclasses.replace(
|
||||||
plan.proxy_plan,
|
plan.proxy_plan,
|
||||||
@@ -129,41 +163,47 @@ def launch(
|
|||||||
mitmproxy_ca_host_path=egress_ca_host,
|
mitmproxy_ca_host_path=egress_ca_host,
|
||||||
mitmproxy_ca_cert_only_host_path=egress_ca_cert_only,
|
mitmproxy_ca_cert_only_host_path=egress_ca_cert_only,
|
||||||
pipelock_ca_host_path=ca_cert_host,
|
pipelock_ca_host_path=ca_cert_host,
|
||||||
# On smolmachines, egress's upstream is pipelock
|
# On smolmachines, egress's upstream is pipelock on the
|
||||||
# on the bundle's localhost — they're in the same
|
# bundle's localhost — they're in the same container's
|
||||||
# container's network namespace.
|
# network namespace.
|
||||||
pipelock_proxy_url=BUNDLE_LOCAL_PIPELOCK_URL,
|
pipelock_proxy_url=BUNDLE_LOCAL_PIPELOCK_URL,
|
||||||
)
|
)
|
||||||
plan = dataclasses.replace(
|
return dataclasses.replace(plan, proxy_plan=proxy_plan, egress_plan=egress_plan)
|
||||||
plan, proxy_plan=proxy_plan, egress_plan=egress_plan,
|
|
||||||
)
|
|
||||||
|
|
||||||
# 3. Build the BundleLaunchSpec from the (now-resolved)
|
|
||||||
# inner Plans: daemon subset, env, bind-mounts, and the
|
def _start_bundle(
|
||||||
# loopback alias to bind published ports against. The
|
plan: SmolmachinesBottlePlan,
|
||||||
# spec's ports_to_publish list expands depending on which
|
network: str,
|
||||||
# daemons the agent needs to reach from the smolvm guest.
|
loopback_ip: str,
|
||||||
|
stack: ExitStack,
|
||||||
|
) -> SmolmachinesBottlePlan:
|
||||||
|
"""Build the BundleLaunchSpec, resolve token env, start the
|
||||||
|
sidecar bundle container, and register teardown."""
|
||||||
bundle_spec = _bundle_launch_spec(plan, network, loopback_ip)
|
bundle_spec = _bundle_launch_spec(plan, network, loopback_ip)
|
||||||
token_env = _resolve_token_env(plan, dict(os.environ))
|
token_env = _resolve_token_env(plan, dict(os.environ))
|
||||||
_bundle.ensure_bundle_image(bundle_spec.image)
|
_bundle.ensure_bundle_image(bundle_spec.image)
|
||||||
_bundle.start_bundle(bundle_spec, env={**os.environ, **token_env})
|
_bundle.start_bundle(bundle_spec, env={**os.environ, **token_env})
|
||||||
stack.callback(_bundle.stop_bundle, plan.slug)
|
stack.callback(_bundle.stop_bundle, plan.slug)
|
||||||
|
return plan
|
||||||
|
|
||||||
# 4. Discover the host-side ports docker assigned for the
|
|
||||||
# bundle's published container ports, and bind the
|
def _discover_urls(
|
||||||
# agent's URLs to `<loopback_ip>:<host port>`. Docker
|
plan: SmolmachinesBottlePlan,
|
||||||
# container IPs (192.168.x.x in the daemon's bridge)
|
loopback_ip: str,
|
||||||
# aren't reachable from the smolvm guest on macOS — TSI
|
) -> SmolmachinesBottlePlan:
|
||||||
# uses macOS networking, and macOS sees the daemon's
|
"""Discover host-side ports for published container ports and
|
||||||
# bridge via the published-port loopback forward only.
|
return the plan with URLs + guest_env stamped in.
|
||||||
#
|
|
||||||
# Proxy hop order matches the docker backend: when the
|
Docker container IPs (192.168.x.x in the daemon's bridge)
|
||||||
# bottle declares egress routes, the agent's first hop is
|
aren't reachable from the smolvm guest on macOS — TSI uses
|
||||||
# egress (for token injection), then pipelock. Without
|
macOS networking, and macOS sees the daemon's bridge via the
|
||||||
# routes, the agent dials pipelock directly. Whichever
|
published-port loopback forward only.
|
||||||
# one is "agent-facing" is the daemon whose port we
|
|
||||||
# publish on host loopback; the other stays bundle-
|
Proxy hop order: when the bottle declares egress routes, the
|
||||||
# internal as the upstream proxy.
|
agent's first hop is egress (for token injection), then
|
||||||
|
pipelock. Without routes, the agent dials pipelock directly.
|
||||||
|
NO_PROXY includes the per-bottle loopback alias so the
|
||||||
|
supervise + git-gate URLs bypass HTTPS_PROXY."""
|
||||||
if plan.egress_plan.routes:
|
if plan.egress_plan.routes:
|
||||||
agent_facing_port = _EGRESS_PORT
|
agent_facing_port = _EGRESS_PORT
|
||||||
else:
|
else:
|
||||||
@@ -172,12 +212,14 @@ def launch(
|
|||||||
plan.slug, agent_facing_port, host_ip=loopback_ip,
|
plan.slug, agent_facing_port, host_ip=loopback_ip,
|
||||||
)
|
)
|
||||||
agent_proxy_url = f"http://{loopback_ip}:{agent_facing_host_port}"
|
agent_proxy_url = f"http://{loopback_ip}:{agent_facing_host_port}"
|
||||||
|
|
||||||
agent_git_gate_host = ""
|
agent_git_gate_host = ""
|
||||||
if plan.git_gate_plan.upstreams:
|
if plan.git_gate_plan.upstreams:
|
||||||
git_gate_host_port = _bundle.bundle_host_port(
|
git_gate_host_port = _bundle.bundle_host_port(
|
||||||
plan.slug, _GIT_HTTP_PORT, host_ip=loopback_ip,
|
plan.slug, _GIT_HTTP_PORT, host_ip=loopback_ip,
|
||||||
)
|
)
|
||||||
agent_git_gate_host = f"{loopback_ip}:{git_gate_host_port}"
|
agent_git_gate_host = f"{loopback_ip}:{git_gate_host_port}"
|
||||||
|
|
||||||
agent_supervise_url = ""
|
agent_supervise_url = ""
|
||||||
if plan.supervise_plan is not None:
|
if plan.supervise_plan is not None:
|
||||||
supervise_host_port = _bundle.bundle_host_port(
|
supervise_host_port = _bundle.bundle_host_port(
|
||||||
@@ -185,20 +227,6 @@ def launch(
|
|||||||
)
|
)
|
||||||
agent_supervise_url = f"http://{loopback_ip}:{supervise_host_port}/"
|
agent_supervise_url = f"http://{loopback_ip}:{supervise_host_port}/"
|
||||||
|
|
||||||
# Stamp the URLs onto the plan + guest_env. provision_git
|
|
||||||
# and provision_supervise read the plan fields; the agent
|
|
||||||
# reads guest_env on every exec_agent.
|
|
||||||
#
|
|
||||||
# NO_PROXY has to include the per-bottle loopback alias —
|
|
||||||
# otherwise claude's HTTPS_PROXY catches direct calls to
|
|
||||||
# the supervise URL (`http://<alias>:<port>/`) and proxies
|
|
||||||
# them through egress, which has no route for the alias
|
|
||||||
# and rejects with "Failed to connect". The smolmachines
|
|
||||||
# git-gate URL uses smart HTTP, so it also has to bypass
|
|
||||||
# the agent's HTTP_PROXY and go straight to the host-
|
|
||||||
# published git HTTP endpoint. Append rather than overwrite
|
|
||||||
# so prepare.py's
|
|
||||||
# `localhost,127.0.0.1` baseline stays in place.
|
|
||||||
existing_no_proxy = plan.guest_env.get("NO_PROXY", "localhost,127.0.0.1")
|
existing_no_proxy = plan.guest_env.get("NO_PROXY", "localhost,127.0.0.1")
|
||||||
guest_env = {
|
guest_env = {
|
||||||
**plan.guest_env,
|
**plan.guest_env,
|
||||||
@@ -210,7 +238,8 @@ def launch(
|
|||||||
guest_env["GIT_GATE_URL"] = f"http://{agent_git_gate_host}"
|
guest_env["GIT_GATE_URL"] = f"http://{agent_git_gate_host}"
|
||||||
if agent_supervise_url:
|
if agent_supervise_url:
|
||||||
guest_env["MCP_SUPERVISE_URL"] = agent_supervise_url
|
guest_env["MCP_SUPERVISE_URL"] = agent_supervise_url
|
||||||
plan = dataclasses.replace(
|
|
||||||
|
return dataclasses.replace(
|
||||||
plan,
|
plan,
|
||||||
guest_env=guest_env,
|
guest_env=guest_env,
|
||||||
agent_proxy_url=agent_proxy_url,
|
agent_proxy_url=agent_proxy_url,
|
||||||
@@ -218,25 +247,20 @@ def launch(
|
|||||||
agent_supervise_url=agent_supervise_url,
|
agent_supervise_url=agent_supervise_url,
|
||||||
)
|
)
|
||||||
|
|
||||||
# 5. Build the agent image and pack it into a
|
|
||||||
# `.smolmachine` artifact (or hit the per-Dockerfile-digest
|
|
||||||
# cache). Runs here, not in prepare, so the docker-build
|
|
||||||
# output doesn't garble the dashboard's preflight modal:
|
|
||||||
# both the curses-endwin path and the tmux pane-routing
|
|
||||||
# path redirect stderr around `launch` already.
|
|
||||||
agent_from_path = _ensure_smolmachine(
|
|
||||||
plan.agent_image_ref,
|
|
||||||
dockerfile=plan.agent_dockerfile_path,
|
|
||||||
)
|
|
||||||
|
|
||||||
# smolvm VM. --from carries the pre-packed .smolmachine
|
def _launch_vm(
|
||||||
# artifact; --allow-cidr + -e carry the per-bottle TSI
|
plan: SmolmachinesBottlePlan,
|
||||||
# allowlist + env. The allowlist is the per-bottle
|
agent_from_path: Path,
|
||||||
# loopback alias — narrowing it to one /32 keeps the
|
loopback_ip: str,
|
||||||
# agent from reaching other host loopback services or
|
stack: ExitStack,
|
||||||
# other bottles' published ports. Smolfile isn't usable
|
) -> None:
|
||||||
# here — smolvm 0.8.0 makes `--from` and `--smolfile`
|
"""Create, patch, and start the smolvm VM; register teardown.
|
||||||
# mutually exclusive.
|
|
||||||
|
--allow-cidr is the per-bottle loopback alias so the guest can
|
||||||
|
only reach this bottle's bundle ports. force_allowlist patches
|
||||||
|
smolvm 0.8.0's silent-drop of --allow-cidr when combined with
|
||||||
|
--from. Smolfile isn't usable here — smolvm 0.8.0 makes --from
|
||||||
|
and --smolfile mutually exclusive."""
|
||||||
_smolvm.machine_create(
|
_smolvm.machine_create(
|
||||||
plan.machine_name,
|
plan.machine_name,
|
||||||
from_path=agent_from_path,
|
from_path=agent_from_path,
|
||||||
@@ -244,61 +268,33 @@ def launch(
|
|||||||
env=plan.guest_env,
|
env=plan.guest_env,
|
||||||
)
|
)
|
||||||
stack.callback(_smolvm.machine_delete, plan.machine_name)
|
stack.callback(_smolvm.machine_delete, plan.machine_name)
|
||||||
# Workaround smolvm 0.8.0: `--allow-cidr` is silently
|
# Workaround smolvm 0.8.0: `--allow-cidr` is silently dropped
|
||||||
# dropped when combined with `--from`. Patch the persisted
|
# when combined with `--from`. Patch the persisted state DB
|
||||||
# state DB to set the allowlist before start so the booted
|
# before start so the booted VM's TSI actually enforces.
|
||||||
# VM's TSI actually enforces. See loopback_alias's module
|
|
||||||
# docstring for the investigation that led here.
|
|
||||||
_loopback.force_allowlist(plan.machine_name, [f"{loopback_ip}/32"])
|
_loopback.force_allowlist(plan.machine_name, [f"{loopback_ip}/32"])
|
||||||
_smolvm.machine_start(plan.machine_name)
|
_smolvm.machine_start(plan.machine_name)
|
||||||
stack.callback(_smolvm.machine_stop, plan.machine_name)
|
stack.callback(_smolvm.machine_stop, plan.machine_name)
|
||||||
|
|
||||||
# 6. Repair filesystem ownership + perms that smolvm's
|
|
||||||
# pack process remapped to the host invoker's uid (501
|
def _init_vm(plan: SmolmachinesBottlePlan) -> None:
|
||||||
# on macOS) rather than preserving the image's expected
|
"""Repair filesystem ownership and wait for exec channel readiness.
|
||||||
# ownership.
|
|
||||||
#
|
Ownership repair: smolvm's pack process remaps files to the host
|
||||||
# - /home/node → node:node so the node user can write
|
invoker's uid (501 on macOS). /home/node must be node:node so
|
||||||
# its own dotfiles (claude appendFileSync on
|
Claude Code can write ~/.claude.json; /tmp + /var/tmp need root
|
||||||
# ~/.claude.json otherwise bails with ENOENT/EPERM
|
mode 1777 so non-root processes can create per-uid scratch dirs.
|
||||||
# and the TUI hangs without surfacing the error).
|
All folded into one sh -c to avoid back-to-back exec calls
|
||||||
# - /tmp + /var/tmp → root:root mode 1777 so non-root
|
immediately after machine_start (libkrun exec-channel race).
|
||||||
# processes can create their per-uid scratch dirs
|
|
||||||
# (claude-code creates /tmp/claude-<uid>/ as soon as
|
wait_exec_ready polls until the exec channel is ready for the
|
||||||
# it spawns a Bash tool call).
|
subsequent provision calls, replacing the empirical sleep."""
|
||||||
#
|
|
||||||
# All folded into one sh -c so we only pay one
|
|
||||||
# machine_exec round trip — back-to-back exec calls
|
|
||||||
# right after machine_start hit a SIGKILL race in
|
|
||||||
# libkrun's exec channel (see provision_ca for the
|
|
||||||
# other half of this same workaround).
|
|
||||||
_smolvm.machine_exec(plan.machine_name, [
|
_smolvm.machine_exec(plan.machine_name, [
|
||||||
"sh", "-c",
|
"sh", "-c",
|
||||||
"chown -R node:node /home/node && "
|
"chown -R node:node /home/node && "
|
||||||
"chown root:root /tmp /var/tmp && "
|
"chown root:root /tmp /var/tmp && "
|
||||||
"chmod 1777 /tmp /var/tmp",
|
"chmod 1777 /tmp /var/tmp",
|
||||||
])
|
])
|
||||||
|
_smolvm.wait_exec_ready(plan.machine_name)
|
||||||
# Wait briefly for the VM to settle. Back-to-back smolvm
|
|
||||||
# machine_exec calls immediately after machine_start
|
|
||||||
# occasionally SIGKILL the in-VM child at ~100ms (looks
|
|
||||||
# like a VM warm-up race in libkrun's exec channel).
|
|
||||||
# 1.5s is empirically enough to dodge it; provisioning
|
|
||||||
# already takes seconds so the wait is amortized.
|
|
||||||
time.sleep(1.5)
|
|
||||||
|
|
||||||
# 7. Provision (CA / prompt / skills / git / supervise).
|
|
||||||
prompt_path = provision(plan, plan.machine_name)
|
|
||||||
|
|
||||||
yield SmolmachinesBottle(
|
|
||||||
plan.machine_name,
|
|
||||||
prompt_path=prompt_path,
|
|
||||||
guest_env=plan.guest_env,
|
|
||||||
agent_command=plan.agent_command,
|
|
||||||
agent_prompt_mode=plan.agent_prompt_mode,
|
|
||||||
)
|
|
||||||
finally:
|
|
||||||
stack.close()
|
|
||||||
|
|
||||||
|
|
||||||
def _bundle_launch_spec(
|
def _bundle_launch_spec(
|
||||||
@@ -324,10 +320,9 @@ def _bundle_launch_spec(
|
|||||||
# is "agent-facing" gets its port published on the host
|
# is "agent-facing" gets its port published on the host
|
||||||
# loopback (see `_ensure_smolmachine`'s discovery loop) and the
|
# loopback (see `_discover_urls`'s host-port lookup) and the
|
||||||
# other stays bundle-internal. The bundle is NOT reachable by
|
# other stays bundle-internal. The bundle is NOT reachable by
|
||||||
# bridge IP from the smolvm guest, so the
|
# bridge IP from the smolvm guest on macOS — TSI uses macOS
|
||||||
# PRD-0023-chunk-3 EGRESS_LISTEN_HOST=127.0.0.1 mitigation
|
# networking, and macOS sees the daemon's bridge via the
|
||||||
# isn't needed: the agent can only dial whatever daemon's
|
# published-port loopback forward only.
|
||||||
# host port we publish, period.
|
|
||||||
|
|
||||||
# --- pipelock ---------------------------------------------
|
# --- pipelock ---------------------------------------------
|
||||||
pp = plan.proxy_plan
|
pp = plan.proxy_plan
|
||||||
|
|||||||
@@ -45,6 +45,7 @@ alias gets handed to a new bottle."""
|
|||||||
|
|
||||||
from __future__ import annotations
|
from __future__ import annotations
|
||||||
|
|
||||||
|
import fcntl
|
||||||
import json
|
import json
|
||||||
import os
|
import os
|
||||||
import platform
|
import platform
|
||||||
@@ -83,6 +84,14 @@ _POOL_START = 16
|
|||||||
_POOL_END = 31 # inclusive
|
_POOL_END = 31 # inclusive
|
||||||
|
|
||||||
|
|
||||||
|
# File lock that serialises concurrent allocate() calls so two
|
||||||
|
# simultaneous launches can't read the same docker state and claim
|
||||||
|
# the same alias. Narrowed to the allocate() call itself; docker run
|
||||||
|
# runs after the lock is released. Once the container is running it
|
||||||
|
# appears in docker state and future allocate() calls will see it.
|
||||||
|
_ALLOC_LOCK_PATH = Path.home() / ".cache" / "bot-bottle" / "smolmachines.lock"
|
||||||
|
|
||||||
|
|
||||||
# Loopback aliases pool: 127.0.0.<start>..127.0.0.<end>.
|
# Loopback aliases pool: 127.0.0.<start>..127.0.0.<end>.
|
||||||
def _pool_addresses() -> list[str]:
|
def _pool_addresses() -> list[str]:
|
||||||
return [f"127.0.0.{i}" for i in range(_POOL_START, _POOL_END + 1)]
|
return [f"127.0.0.{i}" for i in range(_POOL_START, _POOL_END + 1)]
|
||||||
@@ -179,9 +188,20 @@ def allocate(slug: str) -> str:
|
|||||||
On non-macOS the whole `127.0.0.0/8` is loopback by default;
|
On non-macOS the whole `127.0.0.0/8` is loopback by default;
|
||||||
`127.0.0.1` is fine to share and we skip the alias dance.
|
`127.0.0.1` is fine to share and we skip the alias dance.
|
||||||
This still returns a deterministic address so launch.py's
|
This still returns a deterministic address so launch.py's
|
||||||
callers don't have to branch on platform."""
|
callers don't have to branch on platform.
|
||||||
|
|
||||||
|
An exclusive file lock serialises concurrent calls so two
|
||||||
|
simultaneous launches don't read the same docker state and
|
||||||
|
claim the same alias."""
|
||||||
if not _is_macos():
|
if not _is_macos():
|
||||||
return "127.0.0.1"
|
return "127.0.0.1"
|
||||||
|
_ALLOC_LOCK_PATH.parent.mkdir(parents=True, exist_ok=True)
|
||||||
|
with open(_ALLOC_LOCK_PATH, "w") as lf:
|
||||||
|
fcntl.flock(lf, fcntl.LOCK_EX)
|
||||||
|
return _allocate_locked()
|
||||||
|
|
||||||
|
|
||||||
|
def _allocate_locked() -> str:
|
||||||
in_use = _aliases_in_use()
|
in_use = _aliases_in_use()
|
||||||
for ip in _pool_addresses():
|
for ip in _pool_addresses():
|
||||||
if ip not in in_use:
|
if ip not in in_use:
|
||||||
|
|||||||
@@ -27,11 +27,13 @@ from __future__ import annotations
|
|||||||
|
|
||||||
import shutil
|
import shutil
|
||||||
import subprocess
|
import subprocess
|
||||||
|
import time
|
||||||
from dataclasses import dataclass
|
from dataclasses import dataclass
|
||||||
from pathlib import Path
|
from pathlib import Path
|
||||||
from typing import Mapping, Sequence
|
from typing import Mapping, Sequence
|
||||||
|
|
||||||
|
|
||||||
|
|
||||||
_SMOLVM = "smolvm"
|
_SMOLVM = "smolvm"
|
||||||
|
|
||||||
|
|
||||||
@@ -197,6 +199,34 @@ def machine_exec(
|
|||||||
)
|
)
|
||||||
|
|
||||||
|
|
||||||
|
def wait_exec_ready(name: str, *, timeout: float = 5.0) -> None:
|
||||||
|
"""Poll `machine exec true` until exit 0 or `timeout` elapses.
|
||||||
|
|
||||||
|
Replaces `time.sleep(1.5)` after `machine_start`: libkrun's exec
|
||||||
|
channel needs a brief warm-up before back-to-back exec calls are
|
||||||
|
safe. Polling exits as soon as the channel is ready and fails
|
||||||
|
loudly if the VM never responds."""
|
||||||
|
deadline = time.monotonic() + timeout
|
||||||
|
delay = 0.1
|
||||||
|
while time.monotonic() < deadline:
|
||||||
|
r = machine_exec(name, ["true"])
|
||||||
|
if r.returncode == 0:
|
||||||
|
return
|
||||||
|
remaining = deadline - time.monotonic()
|
||||||
|
if remaining <= 0:
|
||||||
|
break
|
||||||
|
time.sleep(min(delay, remaining))
|
||||||
|
delay = min(delay * 2, 0.5)
|
||||||
|
argv = ["smolvm", "machine", "exec", "--name", name, "--", "true"]
|
||||||
|
raise SmolvmError(
|
||||||
|
|
|||||||
|
argv,
|
||||||
|
subprocess.CompletedProcess(
|
||||||
|
args=argv, returncode=-1, stdout="",
|
||||||
|
stderr=f"exec channel not ready after {timeout:.0f}s — VM may have failed to boot.",
|
||||||
|
),
|
||||||
|
)
|
||||||
|
|
||||||
|
|
||||||
def machine_cp(src: str, dst: str) -> None:
|
def machine_cp(src: str, dst: str) -> None:
|
||||||
"""`smolvm machine cp SRC DST`. Path syntax: `machine:path` to
|
"""`smolvm machine cp SRC DST`. Path syntax: `machine:path` to
|
||||||
reference a path inside the VM, bare path for the host. Both
|
reference a path inside the VM, bare path for the host. Both
|
||||||
|
|||||||
@@ -0,0 +1,221 @@
|
|||||||
|
# PRD 0032: Decompose smolmachines launch and harden bringup sequencing
|
||||||
|
|
||||||
|
- **Status:** Active
|
||||||
|
- **Author:** didericis-claude
|
||||||
|
- **Created:** 2026-06-02
|
||||||
|
- **Issue:** #122
|
||||||
|
|
||||||
|
## Summary
|
||||||
|
|
||||||
|
Split `launch()` into named per-step helpers, replace the empirical
|
||||||
|
`time.sleep(1.5)` with a readiness poll, and file-lock loopback alias
|
||||||
|
allocation. Addresses the three actionable issues from the #117 hotspot
|
||||||
|
review of `smolmachines/launch.py`.
|
||||||
|
|
||||||
|
## Problem
|
||||||
|
|
||||||
|
### 1. `launch()` step ordering
|
||||||
|
|
||||||
|
`launch()` in `smolmachines/launch.py` is 207 lines. Seven sequenced
|
||||||
|
steps are marked by numbered inline comments (`# 1. Reserve a loopback
|
||||||
|
alias`, `# 2. Mint per-bottle CAs`, ...) — the sequencing is
|
||||||
|
load-bearing (CA paths must be filled before the bundle spec is built;
|
||||||
|
the bundle must be running before port discovery; the VM must be created
|
||||||
|
before the allowlist is patched), but the dependencies are enforced only
|
||||||
|
by linear ordering within one function. Adding a new daemon, changing
|
||||||
|
the port-forward strategy, or debugging a bringup failure requires
|
||||||
|
reading the whole function to understand what state each step produces.
|
||||||
|
Each step is also not individually testable without mocking the entire
|
||||||
|
surrounding context.
|
||||||
|
|
||||||
|
### 2. `time.sleep(1.5)` for libkrun exec-channel race
|
||||||
|
|
||||||
|
After `machine_start`, back-to-back `machine_exec` calls occasionally
|
||||||
|
hit a SIGKILL in libkrun's exec channel at ~100ms. The sleep is
|
||||||
|
documented as "1.5s is empirically enough; provisioning already takes
|
||||||
|
seconds so the wait is amortized." The failure mode if the sleep is
|
||||||
|
insufficient: the filesystem-repair exec (`chown -R node:node /home/node`)
|
||||||
|
is SIGKILLed silently, and the agent later bails with `ENOENT`/`EPERM`
|
||||||
|
when Claude Code tries to write to `~/.claude.json`. A poll-until-ready
|
||||||
|
loop is more robust than a fixed duration: it exits as soon as the exec
|
||||||
|
channel is up, fails loudly with a timeout if the VM never becomes
|
||||||
|
responsive, and is self-documenting about what it is waiting for.
|
||||||
|
|
||||||
|
### 3. Loopback alias allocation is not concurrent-safe
|
||||||
|
|
||||||
|
`loopback_alias.allocate()` reads docker container state to determine
|
||||||
|
which aliases are already in use, then returns the lowest free alias.
|
||||||
|
There is no lock between that read and the bundle's `docker run` (which
|
||||||
|
creates the container that will appear in future `docker ps` output). Two
|
||||||
|
simultaneous bottle launches can both see the same alias as free and
|
||||||
|
claim it, causing both bundles to bind on the same loopback IP. On macOS,
|
||||||
|
where users occasionally start multiple agents in quick succession, this
|
||||||
|
is a realistic failure mode.
|
||||||
|
|
||||||
|
## Non-goals
|
||||||
|
|
||||||
|
- Removing `force_allowlist` / the `--allow-cidr` DB patch. That is a
|
||||||
|
workaround for a smolvm 0.8.0 bug; removal is a one-liner when smolvm
|
||||||
|
honors the CLI flag upstream.
|
||||||
|
- Changing the ephemeral registry / crane detour in `local_registry.py`.
|
||||||
|
Required by Docker Desktop's network topology.
|
||||||
|
- Changing `_ensure_smolmachine`'s cache design. Cache invalidation by
|
||||||
|
docker image ID works; issue #111 tracks a separate stale-sidecar
|
||||||
|
concern.
|
||||||
|
|
||||||
|
## Design
|
||||||
|
|
||||||
|
### 1. Decompose `launch()` into named helpers
|
||||||
|
|
||||||
|
Extract six focused helpers. `launch()` becomes a coordinator that calls
|
||||||
|
them in order, passing the `ExitStack` for teardown registration:
|
||||||
|
|
||||||
|
```
|
||||||
|
_allocate_resources(plan, stack) → (loopback_ip, network)
|
||||||
|
```
|
||||||
|
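Reserve the loopback alias, create the docker bridge network, and register
|
||||||
|
the network's teardown callback. The alias needs no callback of its own: `allocate()` derives in-use aliases from docker container state.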
|
||||||
|
|
||||||
|
```
|
||||||
|
_mint_certs(plan) → plan
|
||||||
|
```
|
||||||
|
Pipelock TLS init (always). Egress TLS init when `plan.egress_plan.routes`
|
||||||
|
is non-empty. Returns the plan with CA paths filled via
|
||||||
|
`dataclasses.replace`.
|
||||||
|
|
||||||
|
```
|
||||||
|
_start_bundle(plan, network, loopback_ip, stack) → plan
|
||||||
|
```
|
||||||
|
Build the `BundleLaunchSpec`, resolve token env, start the bundle
|
||||||
|
container, register teardown. Returns the plan unchanged: no plan field
|
||||||
|
carries `bundle_spec`, so it stays local to this helper and is not exposed
|
||||||
|
to callers.
|
||||||
|
|
||||||
|
```
|
||||||
|
_discover_urls(plan, loopback_ip) → plan
|
||||||
|
```
|
||||||
|
Look up host-side ports for the published container ports; assemble
|
||||||
|
`agent_proxy_url`, `agent_git_gate_host`, `agent_supervise_url`; stamp
|
||||||
|
them onto the plan and into `guest_env`.
|
||||||
|
|
||||||
|
```
|
||||||
|
_launch_vm(plan, agent_from_path, loopback_ip, stack) → None
|
||||||
|
```
|
||||||
|
`machine_create` + `force_allowlist` + `machine_start`. Register
|
||||||
|
`machine_stop` and `machine_delete` teardown callbacks on the stack.
|
||||||
|
|
||||||
|
```
|
||||||
|
_init_vm(plan) → None
|
||||||
|
```
|
||||||
|
Filesystem-repair exec (`chown`/`chmod`) followed by
|
||||||
|
`wait_exec_ready()`.
|
||||||
|
|
||||||
|
`launch()` reduces to:
|
||||||
|
|
||||||
|
```python
|
||||||
|
loopback_ip, network = _allocate_resources(plan, stack)
|
||||||
|
plan = _mint_certs(plan)
|
||||||
|
plan = _start_bundle(plan, network, loopback_ip, stack)
|
||||||
|
plan = _discover_urls(plan, loopback_ip)
|
||||||
|
agent_from_path = _ensure_smolmachine(plan.agent_image_ref,
|
||||||
|
dockerfile=plan.agent_dockerfile_path)
|
||||||
|
_launch_vm(plan, agent_from_path, loopback_ip, stack)
|
||||||
|
_init_vm(plan)
|
||||||
|
prompt_path = provision(plan, plan.machine_name)
|
||||||
|
yield SmolmachinesBottle(...)
|
||||||
|
```
|
||||||
|
|
||||||
|
Each helper's inputs and outputs are explicit; each is independently
|
||||||
|
testable with a minimal set of mocks.
|
||||||
|
|
||||||
|
### 2. Replace `time.sleep(1.5)` with `wait_exec_ready`
|
||||||
|
|
||||||
|
Add to `smolvm.py`:
|
||||||
|
|
||||||
|
```python
|
||||||
|
def wait_exec_ready(name: str, *, timeout: float = 5.0) -> None:
|
||||||
|
"""Poll until `machine exec true` exits 0 or `timeout` elapses.
|
||||||
|
Replaces a fixed sleep after machine_start for the libkrun
|
||||||
|
exec-channel warm-up race."""
|
||||||
|
deadline = time.monotonic() + timeout
|
||||||
|
delay = 0.1
|
||||||
|
while time.monotonic() < deadline:
|
||||||
|
r = machine_exec(name, ["true"])
|
||||||
|
if r.returncode == 0:
|
||||||
|
return
|
||||||
|
remaining = deadline - time.monotonic()
|
||||||
|
if remaining <= 0:
|
||||||
|
break
|
||||||
|
time.sleep(min(delay, remaining))
|
||||||
|
delay = min(delay * 2, 0.5)
|
||||||
|
die(
|
||||||
|
f"smolvm machine {name!r}: exec channel not ready after "
|
||||||
|
f"{timeout:.0f}s — VM may have failed to boot."
|
||||||
|
)
|
||||||
|
```
|
||||||
|
|
||||||
|
`_init_vm` calls `wait_exec_ready` after the chown/chmod exec instead of
|
||||||
|
`time.sleep(1.5)`. The `time` import in `launch.py` is removed.
|
||||||
|
|
||||||
|
### 3. File-lock loopback alias allocation
|
||||||
|
|
||||||
|
Add to `loopback_alias.py`:
|
||||||
|
|
||||||
|
```python
|
||||||
|
import fcntl
|
||||||
|
|
||||||
|
_ALLOC_LOCK_PATH = Path.home() / ".cache" / "bot-bottle" / "smolmachines.lock"
|
||||||
|
|
||||||
|
def allocate(slug: str) -> str:
|
||||||
|
if not _is_macos():
|
||||||
|
return "127.0.0.1"
|
||||||
|
_ALLOC_LOCK_PATH.parent.mkdir(parents=True, exist_ok=True)
|
||||||
|
with open(_ALLOC_LOCK_PATH, "w") as lf:
|
||||||
|
fcntl.flock(lf, fcntl.LOCK_EX)
|
||||||
|
return _allocate_locked()
|
||||||
|
|
||||||
|
def _allocate_locked() -> str:
|
||||||
|
in_use = _aliases_in_use()
|
||||||
|
for ip in _pool_addresses():
|
||||||
|
if ip not in in_use:
|
||||||
|
return ip
|
||||||
|
die(...)
|
||||||
|
return ""
|
||||||
|
```
|
||||||
|
|
||||||
|
The lock is held only for the duration of `_aliases_in_use()` + the
|
||||||
|
`allocate` return. The bundle's `docker run` runs after the lock is
|
||||||
|
released. This is sufficient: once `docker run` returns, the container
|
||||||
|
is visible in docker state and future `allocate()` calls will see it.
|
||||||
|
The remaining window (lock released → container appears in docker state)
|
||||||
|
is narrowed from "the entire bringup sequence" to "a single subprocess
|
||||||
|
call," making a collision between two concurrent launches effectively
|
||||||
|
impossible in practice.
|
||||||
|
|
||||||
|
No lock is taken on Linux (the `_is_macos()` early-return fires
|
||||||
|
before the lock path is opened).
|
||||||
|
|
||||||
|
## Test impact
|
||||||
|
|
||||||
|
- Unit tests for each extracted helper can mock one subprocess boundary
|
||||||
|
at a time (smolvm, docker, pipelock TLS init) without wiring the full
|
||||||
|
`launch()` ExitStack.
|
||||||
|
- `wait_exec_ready` needs a test with `machine_exec` stubbed to return
|
||||||
|
non-zero N times before 0 — verifies the backoff loop and the timeout
|
||||||
|
die path.
|
||||||
|
- `allocate` tests are unchanged in shape; the lock is acquired and
|
||||||
|
released within the call so tests don't need to be aware of it.
|
||||||
|
|
||||||
|
## Implementation chunks
|
||||||
|
|
||||||
|
1. **PRD (this commit).** Sets the design.
|
||||||
|
2. **Decompose `launch()`.**
|
||||||
|
3. **Replace sleep with `wait_exec_ready`.**
|
||||||
|
4. **File-lock `allocate()`.**
|
||||||
|
5. **Tests.** Unit tests for each helper; `wait_exec_ready` backoff + timeout.
|
||||||
|
|
||||||
|
## References
|
||||||
|
|
||||||
|
- Issue #122: Decompose smolmachines launch and harden bringup sequencing.
|
||||||
|
- Issue #117: Complexity hotspots — source of the smolmachines/launch.py finding.
|
||||||
|
- Issue #111: Smolmachine sidecar doesn't reliably get refreshed (separate, not addressed here).
|
||||||
@@ -11,6 +11,7 @@ import json
|
|||||||
import sqlite3
|
import sqlite3
|
||||||
import subprocess
|
import subprocess
|
||||||
import tempfile
|
import tempfile
|
||||||
|
import threading
|
||||||
import unittest
|
import unittest
|
||||||
from pathlib import Path
|
from pathlib import Path
|
||||||
from unittest.mock import patch
|
from unittest.mock import patch
|
||||||
@@ -144,6 +145,55 @@ class TestAllocate(unittest.TestCase):
|
|||||||
loopback_alias.allocate("demo-overflow")
|
loopback_alias.allocate("demo-overflow")
|
||||||
|
|
||||||
|
|
||||||
|
class TestAllocateLock(unittest.TestCase):
|
||||||
|
"""allocate() on macOS acquires a file lock so concurrent calls
|
||||||
|
serialise rather than racing on docker state."""
|
||||||
|
|
||||||
|
def test_acquires_exclusive_lock_on_macos(self):
|
||||||
|
import fcntl as fcntl_mod
|
||||||
|
flock_calls: list[int] = []
|
||||||
|
|
||||||
|
def record_flock(fd, op):
|
||||||
|
flock_calls.append(op)
|
||||||
|
|
||||||
|
with tempfile.TemporaryDirectory() as tmp:
|
||||||
|
lock_path = Path(tmp) / "smolmachines.lock"
|
||||||
|
with patch.object(loopback_alias, "_is_macos", return_value=True), \
|
||||||
|
patch.object(loopback_alias, "_ALLOC_LOCK_PATH", lock_path), \
|
||||||
|
patch.object(loopback_alias, "_aliases_in_use", return_value=set()), \
|
||||||
|
patch.object(loopback_alias.fcntl, "flock",
|
||||||
|
side_effect=record_flock):
|
||||||
|
loopback_alias.allocate("demo")
|
||||||
|
|
||||||
|
self.assertIn(fcntl_mod.LOCK_EX, flock_calls)
|
||||||
|
|
||||||
|
def test_no_lock_on_linux(self):
|
||||||
|
# Linux early-returns before touching the lock file.
|
||||||
|
with patch.object(loopback_alias, "_is_macos", return_value=False), \
|
||||||
|
patch.object(loopback_alias.fcntl, "flock") as flock:
|
||||||
|
loopback_alias.allocate("demo")
|
||||||
|
flock.assert_not_called()
|
||||||
|
|
||||||
|
def test_sequential_allocations_with_shared_lock_are_serialised(self):
|
||||||
|
# Two sequential calls share the same lock file. The second
|
||||||
|
# call sees {127.0.0.16} in use (as if the first caller's
|
||||||
|
# docker run completed between the two lock acquisitions) and
|
||||||
|
# returns the next alias.
|
||||||
|
in_use_seq = [set(), {"127.0.0.16"}]
|
||||||
|
|
||||||
|
with tempfile.TemporaryDirectory() as tmp:
|
||||||
|
lock_path = Path(tmp) / "smolmachines.lock"
|
||||||
|
results: list[str] = []
|
||||||
|
for _ in range(2):
|
||||||
|
with patch.object(loopback_alias, "_is_macos", return_value=True), \
|
||||||
|
patch.object(loopback_alias, "_ALLOC_LOCK_PATH", lock_path), \
|
||||||
|
patch.object(loopback_alias, "_aliases_in_use",
|
||||||
|
return_value=in_use_seq.pop(0)):
|
||||||
|
results.append(loopback_alias.allocate("demo"))
|
||||||
|
|
||||||
|
self.assertEqual(["127.0.0.16", "127.0.0.17"], results)
|
||||||
|
|
||||||
|
|
||||||
class TestAliasInUseDetection(unittest.TestCase):
|
class TestAliasInUseDetection(unittest.TestCase):
|
||||||
"""`_aliases_in_use` inspects every running bundle and pulls
|
"""`_aliases_in_use` inspects every running bundle and pulls
|
||||||
each container's port-binding `HostIp` out. The detection has
|
each container's port-binding `HostIp` out. The detection has
|
||||||
|
|||||||
@@ -12,6 +12,7 @@ import unittest
|
|||||||
from pathlib import Path
|
from pathlib import Path
|
||||||
from unittest.mock import patch
|
from unittest.mock import patch
|
||||||
|
|
||||||
|
from bot_bottle.backend.smolmachines import smolvm as smolvm_mod
|
||||||
from bot_bottle.backend.smolmachines.smolvm import (
|
from bot_bottle.backend.smolmachines.smolvm import (
|
||||||
SmolvmError,
|
SmolvmError,
|
||||||
SmolvmRunResult,
|
SmolvmRunResult,
|
||||||
@@ -23,6 +24,7 @@ from bot_bottle.backend.smolmachines.smolvm import (
|
|||||||
machine_start,
|
machine_start,
|
||||||
machine_stop,
|
machine_stop,
|
||||||
pack_create,
|
pack_create,
|
||||||
|
wait_exec_ready,
|
||||||
)
|
)
|
||||||
|
|
||||||
|
|
||||||
@@ -204,6 +206,43 @@ class TestErrorPath(unittest.TestCase):
|
|||||||
self.assertEqual(SmolvmRunResult(42, "", "nope"), r)
|
self.assertEqual(SmolvmRunResult(42, "", "nope"), r)
|
||||||
|
|
||||||
|
|
||||||
|
class TestWaitExecReady(unittest.TestCase):
|
||||||
|
"""wait_exec_ready polls machine_exec(name, ["true"]) until it
|
||||||
|
returns 0, then exits. On timeout it raises SmolvmError."""
|
||||||
|
|
||||||
|
def test_returns_immediately_when_exec_succeeds_first_try(self):
|
||||||
|
with patch.object(smolvm_mod, "machine_exec",
|
||||||
|
return_value=SmolvmRunResult(0, "", "")) as m:
|
||||||
|
wait_exec_ready("vm-x")
|
||||||
|
m.assert_called_once_with("vm-x", ["true"])
|
||||||
|
|
||||||
|
def test_retries_on_nonzero_and_returns_on_success(self):
|
||||||
|
results = [
|
||||||
|
SmolvmRunResult(1, "", "not ready"),
|
||||||
|
SmolvmRunResult(1, "", "not ready"),
|
||||||
|
SmolvmRunResult(0, "", ""),
|
||||||
|
]
|
||||||
|
with patch.object(smolvm_mod, "machine_exec",
|
||||||
|
side_effect=results) as m, \
|
||||||
|
patch.object(smolvm_mod.time, "sleep"):
|
||||||
|
wait_exec_ready("vm-x")
|
||||||
|
self.assertEqual(3, m.call_count)
|
||||||
|
|
||||||
|
def test_raises_smolvm_error_on_timeout(self):
|
||||||
|
# machine_exec always returns non-zero; monotonic advances past
|
||||||
|
# the deadline after the first sleep so the loop exits.
|
||||||
|
ticks = [0.0, 0.0, 10.0] # third call puts us past deadline
|
||||||
|
with patch.object(smolvm_mod, "machine_exec",
|
||||||
|
return_value=SmolvmRunResult(1, "", "")), \
|
||||||
|
patch.object(smolvm_mod.time, "monotonic",
|
||||||
|
side_effect=ticks), \
|
||||||
|
patch.object(smolvm_mod.time, "sleep"):
|
||||||
|
with self.assertRaises(SmolvmError) as cm:
|
||||||
|
wait_exec_ready("vm-x", timeout=5.0)
|
||||||
|
self.assertIn("vm-x", str(cm.exception))
|
||||||
|
self.assertIn("not ready", str(cm.exception))
|
||||||
|
|
||||||
|
|
||||||
class TestIsAvailable(unittest.TestCase):
|
class TestIsAvailable(unittest.TestCase):
|
||||||
def test_true_when_on_path(self):
|
def test_true_when_on_path(self):
|
||||||
with patch(
|
with patch(
|
||||||
|
|||||||
Reference in New Issue
Block a user
Will this crash the dashboard?
If so, this should be some kind of error we raise instead, and some other handler should decide whether or not it dies.