refactor(smolmachines): decompose launch(), add wait_exec_ready, file-lock allocate() (PRD 0032)
Decompose the 207-line launch() into six named helpers: _allocate_resources, _mint_certs, _start_bundle, _discover_urls, _launch_vm, _init_vm. Each has explicit inputs/outputs and is independently testable. Replace time.sleep(1.5) with smolvm.wait_exec_ready(), which polls `machine exec true` with exponential backoff. Exits as soon as the exec channel is ready; dies loudly with a timeout message instead of silently leaving the VM in an unknown state. File-lock loopback_alias.allocate() with fcntl.flock(LOCK_EX) so concurrent bottle launches can't race on docker state and claim the same alias.
This commit is contained in:
@@ -21,7 +21,6 @@ from __future__ import annotations
|
|||||||
|
|
||||||
import dataclasses
|
import dataclasses
|
||||||
import os
|
import os
|
||||||
import time
|
|
||||||
from contextlib import ExitStack, contextmanager
|
from contextlib import ExitStack, contextmanager
|
||||||
from pathlib import Path
|
from pathlib import Path
|
||||||
from typing import Callable, Generator
|
from typing import Callable, Generator
|
||||||
@@ -94,200 +93,23 @@ def launch(
|
|||||||
via the ExitStack."""
|
via the ExitStack."""
|
||||||
stack = ExitStack()
|
stack = ExitStack()
|
||||||
try:
|
try:
|
||||||
# 1. Reserve a loopback alias for this bottle. macOS only
|
loopback_ip, network = _allocate_resources(plan, stack)
|
||||||
# routes 127.0.0.1 by default; the per-bottle alias is
|
plan = _mint_certs(plan)
|
||||||
# what bundles the docker port-publishes and TSI allowlist
|
plan = _start_bundle(plan, network, loopback_ip, stack)
|
||||||
# against, so this bottle can't reach other bottles' (or
|
plan = _discover_urls(plan, loopback_ip)
|
||||||
# other host services') ports on the loopback. Lazy
|
|
||||||
# sudo-driven on first use per boot. No-op on Linux.
|
|
||||||
_loopback.ensure_pool()
|
|
||||||
loopback_ip = _loopback.allocate(plan.slug)
|
|
||||||
|
|
||||||
# 2. Per-bottle docker bridge.
|
# Build the agent image and pack it into a `.smolmachine`
|
||||||
network = _bundle.bundle_network_name(plan.slug)
|
# artifact (or hit the per-Dockerfile-digest cache). Runs
|
||||||
_bundle.create_bundle_network(network, plan.bundle_subnet, plan.bundle_gateway)
|
# here, not in prepare, so the docker-build output doesn't
|
||||||
stack.callback(_bundle.remove_bundle_network, network)
|
# garble the dashboard's preflight modal.
|
||||||
|
|
||||||
# 2. Mint per-bottle CAs and update the inner Plans with
|
|
||||||
# their launch-time paths. pipelock always runs in the
|
|
||||||
# bundle; egress's CA is only minted when the bottle
|
|
||||||
# declares routes (otherwise egress runs idle without
|
|
||||||
# MITM and the CA files would be unused).
|
|
||||||
ca_cert_host, ca_key_host = pipelock_tls_init(plan.proxy_plan.yaml_path.parent)
|
|
||||||
proxy_plan = dataclasses.replace(
|
|
||||||
plan.proxy_plan,
|
|
||||||
ca_cert_host_path=ca_cert_host,
|
|
||||||
ca_key_host_path=ca_key_host,
|
|
||||||
)
|
|
||||||
egress_plan = plan.egress_plan
|
|
||||||
if egress_plan.routes:
|
|
||||||
egress_ca_host, egress_ca_cert_only = egress_tls_init(
|
|
||||||
plan.egress_plan.routes_path.parent,
|
|
||||||
)
|
|
||||||
egress_plan = dataclasses.replace(
|
|
||||||
egress_plan,
|
|
||||||
mitmproxy_ca_host_path=egress_ca_host,
|
|
||||||
mitmproxy_ca_cert_only_host_path=egress_ca_cert_only,
|
|
||||||
pipelock_ca_host_path=ca_cert_host,
|
|
||||||
# On smolmachines, egress's upstream is pipelock
|
|
||||||
# on the bundle's localhost — they're in the same
|
|
||||||
# container's network namespace.
|
|
||||||
pipelock_proxy_url=BUNDLE_LOCAL_PIPELOCK_URL,
|
|
||||||
)
|
|
||||||
plan = dataclasses.replace(
|
|
||||||
plan, proxy_plan=proxy_plan, egress_plan=egress_plan,
|
|
||||||
)
|
|
||||||
|
|
||||||
# 3. Build the BundleLaunchSpec from the (now-resolved)
|
|
||||||
# inner Plans: daemon subset, env, bind-mounts, and the
|
|
||||||
# loopback alias to bind published ports against. The
|
|
||||||
# spec's ports_to_publish list expands depending on which
|
|
||||||
# daemons the agent needs to reach from the smolvm guest.
|
|
||||||
bundle_spec = _bundle_launch_spec(plan, network, loopback_ip)
|
|
||||||
token_env = _resolve_token_env(plan, dict(os.environ))
|
|
||||||
_bundle.ensure_bundle_image(bundle_spec.image)
|
|
||||||
_bundle.start_bundle(bundle_spec, env={**os.environ, **token_env})
|
|
||||||
stack.callback(_bundle.stop_bundle, plan.slug)
|
|
||||||
|
|
||||||
# 4. Discover the host-side ports docker assigned for the
|
|
||||||
# bundle's published container ports, and bind the
|
|
||||||
# agent's URLs to `<loopback_ip>:<host port>`. Docker
|
|
||||||
# container IPs (192.168.x.x in the daemon's bridge)
|
|
||||||
# aren't reachable from the smolvm guest on macOS — TSI
|
|
||||||
# uses macOS networking, and macOS sees the daemon's
|
|
||||||
# bridge via the published-port loopback forward only.
|
|
||||||
#
|
|
||||||
# Proxy hop order matches the docker backend: when the
|
|
||||||
# bottle declares egress routes, the agent's first hop is
|
|
||||||
# egress (for token injection), then pipelock. Without
|
|
||||||
# routes, the agent dials pipelock directly. Whichever
|
|
||||||
# one is "agent-facing" is the daemon whose port we
|
|
||||||
# publish on host loopback; the other stays bundle-
|
|
||||||
# internal as the upstream proxy.
|
|
||||||
if plan.egress_plan.routes:
|
|
||||||
agent_facing_port = _EGRESS_PORT
|
|
||||||
else:
|
|
||||||
agent_facing_port = _PIPELOCK_PORT
|
|
||||||
agent_facing_host_port = _bundle.bundle_host_port(
|
|
||||||
plan.slug, agent_facing_port, host_ip=loopback_ip,
|
|
||||||
)
|
|
||||||
agent_proxy_url = f"http://{loopback_ip}:{agent_facing_host_port}"
|
|
||||||
agent_git_gate_host = ""
|
|
||||||
if plan.git_gate_plan.upstreams:
|
|
||||||
git_gate_host_port = _bundle.bundle_host_port(
|
|
||||||
plan.slug, _GIT_HTTP_PORT, host_ip=loopback_ip,
|
|
||||||
)
|
|
||||||
agent_git_gate_host = f"{loopback_ip}:{git_gate_host_port}"
|
|
||||||
agent_supervise_url = ""
|
|
||||||
if plan.supervise_plan is not None:
|
|
||||||
supervise_host_port = _bundle.bundle_host_port(
|
|
||||||
plan.slug, _SUPERVISE_PORT, host_ip=loopback_ip,
|
|
||||||
)
|
|
||||||
agent_supervise_url = f"http://{loopback_ip}:{supervise_host_port}/"
|
|
||||||
|
|
||||||
# Stamp the URLs onto the plan + guest_env. provision_git
|
|
||||||
# and provision_supervise read the plan fields; the agent
|
|
||||||
# reads guest_env on every exec_agent.
|
|
||||||
#
|
|
||||||
# NO_PROXY has to include the per-bottle loopback alias —
|
|
||||||
# otherwise claude's HTTPS_PROXY catches direct calls to
|
|
||||||
# the supervise URL (`http://<alias>:<port>/`) and proxies
|
|
||||||
# them through egress, which has no route for the alias
|
|
||||||
# and rejects with "Failed to connect". The smolmachines
|
|
||||||
# git-gate URL uses smart HTTP, so it also has to bypass
|
|
||||||
# the agent's HTTP_PROXY and go straight to the host-
|
|
||||||
# published git HTTP endpoint. Append rather than overwrite
|
|
||||||
# so prepare.py's
|
|
||||||
# `localhost,127.0.0.1` baseline stays in place.
|
|
||||||
existing_no_proxy = plan.guest_env.get("NO_PROXY", "localhost,127.0.0.1")
|
|
||||||
guest_env = {
|
|
||||||
**plan.guest_env,
|
|
||||||
"HTTPS_PROXY": agent_proxy_url,
|
|
||||||
"HTTP_PROXY": agent_proxy_url,
|
|
||||||
"NO_PROXY": f"{existing_no_proxy},{loopback_ip}",
|
|
||||||
}
|
|
||||||
if agent_git_gate_host:
|
|
||||||
guest_env["GIT_GATE_URL"] = f"http://{agent_git_gate_host}"
|
|
||||||
if agent_supervise_url:
|
|
||||||
guest_env["MCP_SUPERVISE_URL"] = agent_supervise_url
|
|
||||||
plan = dataclasses.replace(
|
|
||||||
plan,
|
|
||||||
guest_env=guest_env,
|
|
||||||
agent_proxy_url=agent_proxy_url,
|
|
||||||
agent_git_gate_host=agent_git_gate_host,
|
|
||||||
agent_supervise_url=agent_supervise_url,
|
|
||||||
)
|
|
||||||
|
|
||||||
# 5. Build the agent image and pack it into a
|
|
||||||
# `.smolmachine` artifact (or hit the per-Dockerfile-digest
|
|
||||||
# cache). Runs here, not in prepare, so the docker-build
|
|
||||||
# output doesn't garble the dashboard's preflight modal:
|
|
||||||
# both the curses-endwin path and the tmux pane-routing
|
|
||||||
# path redirect stderr around `launch` already.
|
|
||||||
agent_from_path = _ensure_smolmachine(
|
agent_from_path = _ensure_smolmachine(
|
||||||
plan.agent_image_ref,
|
plan.agent_image_ref,
|
||||||
dockerfile=plan.agent_dockerfile_path,
|
dockerfile=plan.agent_dockerfile_path,
|
||||||
)
|
)
|
||||||
|
|
||||||
# smolvm VM. --from carries the pre-packed .smolmachine
|
_launch_vm(plan, agent_from_path, loopback_ip, stack)
|
||||||
# artifact; --allow-cidr + -e carry the per-bottle TSI
|
_init_vm(plan)
|
||||||
# allowlist + env. The allowlist is the per-bottle
|
|
||||||
# loopback alias — narrowing it to one /32 keeps the
|
|
||||||
# agent from reaching other host loopback services or
|
|
||||||
# other bottles' published ports. Smolfile isn't usable
|
|
||||||
# here — smolvm 0.8.0 makes `--from` and `--smolfile`
|
|
||||||
# mutually exclusive.
|
|
||||||
_smolvm.machine_create(
|
|
||||||
plan.machine_name,
|
|
||||||
from_path=agent_from_path,
|
|
||||||
allow_cidrs=[f"{loopback_ip}/32"],
|
|
||||||
env=plan.guest_env,
|
|
||||||
)
|
|
||||||
stack.callback(_smolvm.machine_delete, plan.machine_name)
|
|
||||||
# Workaround smolvm 0.8.0: `--allow-cidr` is silently
|
|
||||||
# dropped when combined with `--from`. Patch the persisted
|
|
||||||
# state DB to set the allowlist before start so the booted
|
|
||||||
# VM's TSI actually enforces. See loopback_alias's module
|
|
||||||
# docstring for the investigation that led here.
|
|
||||||
_loopback.force_allowlist(plan.machine_name, [f"{loopback_ip}/32"])
|
|
||||||
_smolvm.machine_start(plan.machine_name)
|
|
||||||
stack.callback(_smolvm.machine_stop, plan.machine_name)
|
|
||||||
|
|
||||||
# 6. Repair filesystem ownership + perms that smolvm's
|
|
||||||
# pack process remapped to the host invoker's uid (501
|
|
||||||
# on macOS) rather than preserving the image's expected
|
|
||||||
# ownership.
|
|
||||||
#
|
|
||||||
# - /home/node → node:node so the node user can write
|
|
||||||
# its own dotfiles (claude appendFileSync on
|
|
||||||
# ~/.claude.json otherwise bails with ENOENT/EPERM
|
|
||||||
# and the TUI hangs without surfacing the error).
|
|
||||||
# - /tmp + /var/tmp → root:root mode 1777 so non-root
|
|
||||||
# processes can create their per-uid scratch dirs
|
|
||||||
# (claude-code creates /tmp/claude-<uid>/ as soon as
|
|
||||||
# it spawns a Bash tool call).
|
|
||||||
#
|
|
||||||
# All folded into one sh -c so we only pay one
|
|
||||||
# machine_exec round trip — back-to-back exec calls
|
|
||||||
# right after machine_start hit a SIGKILL race in
|
|
||||||
# libkrun's exec channel (see provision_ca for the
|
|
||||||
# other half of this same workaround).
|
|
||||||
_smolvm.machine_exec(plan.machine_name, [
|
|
||||||
"sh", "-c",
|
|
||||||
"chown -R node:node /home/node && "
|
|
||||||
"chown root:root /tmp /var/tmp && "
|
|
||||||
"chmod 1777 /tmp /var/tmp",
|
|
||||||
])
|
|
||||||
|
|
||||||
# Wait briefly for the VM to settle. Back-to-back smolvm
|
|
||||||
# machine_exec calls immediately after machine_start
|
|
||||||
# occasionally SIGKILL the in-VM child at ~100ms (looks
|
|
||||||
# like a VM warm-up race in libkrun's exec channel).
|
|
||||||
# 1.5s is empirically enough to dodge it; provisioning
|
|
||||||
# already takes seconds so the wait is amortized.
|
|
||||||
time.sleep(1.5)
|
|
||||||
|
|
||||||
# 7. Provision (CA / prompt / skills / git / supervise).
|
|
||||||
prompt_path = provision(plan, plan.machine_name)
|
prompt_path = provision(plan, plan.machine_name)
|
||||||
|
|
||||||
yield SmolmachinesBottle(
|
yield SmolmachinesBottle(
|
||||||
@@ -301,6 +123,180 @@ def launch(
|
|||||||
stack.close()
|
stack.close()
|
||||||
|
|
||||||
|
|
||||||
|
def _allocate_resources(
    plan: SmolmachinesBottlePlan,
    stack: ExitStack,
) -> tuple[str, str]:
    """Claim this bottle's loopback alias and stand up its docker bridge.

    Returns `(loopback_ip, network)`. The alias is the address the
    bundle's published ports and the guest's allowlist are bound to
    later in the launch, so the agent can't reach other bottles' or
    host services' ports on loopback. On non-macOS hosts `allocate`
    hands back the shared `127.0.0.1`; the bridge network is created
    on every platform.

    Removal of the bridge is registered on `stack` as soon as the
    network exists, so it is torn down when the stack unwinds."""
    _loopback.ensure_pool()
    alias_ip = _loopback.allocate(plan.slug)

    bridge = _bundle.bundle_network_name(plan.slug)
    _bundle.create_bundle_network(bridge, plan.bundle_subnet, plan.bundle_gateway)
    stack.callback(_bundle.remove_bundle_network, bridge)

    return alias_ip, bridge
|
||||||
|
|
||||||
|
|
||||||
|
def _mint_certs(plan: SmolmachinesBottlePlan) -> SmolmachinesBottlePlan:
    """Mint the per-bottle CAs and return a copy of `plan` carrying
    their host paths.

    The pipelock CA is always minted, next to the proxy plan's YAML.
    The egress (mitmproxy) CA is minted only when the bottle declares
    egress routes; without routes the egress plan is passed through
    untouched. `plan` itself is never mutated — every change goes
    through `dataclasses.replace`."""
    pipelock_cert, pipelock_key = pipelock_tls_init(
        plan.proxy_plan.yaml_path.parent,
    )
    minted_proxy = dataclasses.replace(
        plan.proxy_plan,
        ca_cert_host_path=pipelock_cert,
        ca_key_host_path=pipelock_key,
    )

    # No routes: nothing to mint for egress, hand its plan back as-is.
    if not plan.egress_plan.routes:
        return dataclasses.replace(
            plan, proxy_plan=minted_proxy, egress_plan=plan.egress_plan,
        )

    mitm_ca, mitm_cert_only = egress_tls_init(
        plan.egress_plan.routes_path.parent,
    )
    minted_egress = dataclasses.replace(
        plan.egress_plan,
        mitmproxy_ca_host_path=mitm_ca,
        mitmproxy_ca_cert_only_host_path=mitm_cert_only,
        pipelock_ca_host_path=pipelock_cert,
        # Egress's upstream is pipelock on the bundle's localhost:
        # both daemons share the bundle container's network namespace.
        pipelock_proxy_url=BUNDLE_LOCAL_PIPELOCK_URL,
    )
    return dataclasses.replace(
        plan, proxy_plan=minted_proxy, egress_plan=minted_egress,
    )
|
||||||
|
|
||||||
|
|
||||||
|
def _start_bundle(
    plan: SmolmachinesBottlePlan,
    network: str,
    loopback_ip: str,
    stack: ExitStack,
) -> SmolmachinesBottlePlan:
    """Start the sidecar bundle container for this bottle.

    Builds the launch spec from `plan`, resolves the token env
    against the current process environment, makes sure the bundle
    image is available, and starts the bundle with the process
    environment overlaid by the resolved tokens. `stop_bundle` is
    registered on `stack` once the start call has returned.

    The plan is returned unchanged so the caller can keep its
    `plan = step(plan, ...)` chaining."""
    spec = _bundle_launch_spec(plan, network, loopback_ip)
    resolved_tokens = _resolve_token_env(plan, dict(os.environ))

    _bundle.ensure_bundle_image(spec.image)

    # Token values win over anything already in the process env.
    launch_env = dict(os.environ)
    launch_env.update(resolved_tokens)
    _bundle.start_bundle(spec, env=launch_env)
    stack.callback(_bundle.stop_bundle, plan.slug)

    return plan
|
||||||
|
|
||||||
|
|
||||||
|
def _discover_urls(
    plan: SmolmachinesBottlePlan,
    loopback_ip: str,
) -> SmolmachinesBottlePlan:
    """Look up the bundle's published host ports and return a copy of
    `plan` with the agent-facing URLs and guest env filled in.

    Every URL is `<loopback_ip>:<host port>`: the guest reaches the
    bundle only through the published-port forward on the loopback
    alias, never by docker bridge IP.

    Proxy hop order: with egress routes declared the agent's first
    hop is egress (which then goes to pipelock); without routes the
    agent dials pipelock directly. The git-gate host and supervise
    URL stay empty strings when the plan has no git upstreams / no
    supervise plan.

    NO_PROXY keeps whatever the plan already had (defaulting to
    `localhost,127.0.0.1`) and appends the loopback alias, so the
    supervise and git-gate URLs bypass the proxy settings."""

    def published_port(container_port):
        # Host-side port docker bound on the alias for `container_port`.
        return _bundle.bundle_host_port(
            plan.slug, container_port, host_ip=loopback_ip,
        )

    first_hop_port = _EGRESS_PORT if plan.egress_plan.routes else _PIPELOCK_PORT
    proxy_url = f"http://{loopback_ip}:{published_port(first_hop_port)}"

    git_gate_host = (
        f"{loopback_ip}:{published_port(_GIT_HTTP_PORT)}"
        if plan.git_gate_plan.upstreams
        else ""
    )
    supervise_url = (
        f"http://{loopback_ip}:{published_port(_SUPERVISE_PORT)}/"
        if plan.supervise_plan is not None
        else ""
    )

    no_proxy_base = plan.guest_env.get("NO_PROXY", "localhost,127.0.0.1")
    env = dict(plan.guest_env)
    env.update({
        "HTTPS_PROXY": proxy_url,
        "HTTP_PROXY": proxy_url,
        "NO_PROXY": f"{no_proxy_base},{loopback_ip}",
    })
    if git_gate_host:
        env["GIT_GATE_URL"] = f"http://{git_gate_host}"
    if supervise_url:
        env["MCP_SUPERVISE_URL"] = supervise_url

    return dataclasses.replace(
        plan,
        guest_env=env,
        agent_proxy_url=proxy_url,
        agent_git_gate_host=git_gate_host,
        agent_supervise_url=supervise_url,
    )
|
||||||
|
|
||||||
|
|
||||||
|
def _launch_vm(
    plan: SmolmachinesBottlePlan,
    agent_from_path: Path,
    loopback_ip: str,
    stack: ExitStack,
) -> None:
    """Create the smolvm machine, pin its allowlist, and boot it.

    The machine is created from the pre-packed artifact at
    `agent_from_path` with the plan's guest env, and its allowlist is
    the single `/32` of this bottle's loopback alias so the guest can
    only reach this bottle's bundle ports.

    Teardown is registered on `stack` step by step — delete right
    after create, stop right after start — so the stack unwinds them
    in reverse (stop, then delete)."""
    name = plan.machine_name
    alias_cidr = f"{loopback_ip}/32"

    _smolvm.machine_create(
        name,
        from_path=agent_from_path,
        allow_cidrs=[alias_cidr],
        env=plan.guest_env,
    )
    stack.callback(_smolvm.machine_delete, name)

    # Workaround for smolvm 0.8.0, which silently drops `--allow-cidr`
    # when combined with `--from`: write the allowlist into the
    # persisted state before the VM starts so it is actually enforced.
    _loopback.force_allowlist(name, [alias_cidr])

    _smolvm.machine_start(name)
    stack.callback(_smolvm.machine_stop, name)
|
||||||
|
|
||||||
|
|
||||||
|
def _init_vm(plan: SmolmachinesBottlePlan) -> None:
    """Fix up guest filesystem ownership, then block until the exec
    channel answers.

    The repair runs as one `sh -c` chain inside the VM:
    `/home/node` is handed to node:node, and `/tmp` + `/var/tmp` are
    reset to root:root with mode 1777. Keeping it to a single exec
    avoids issuing back-to-back exec calls right after the VM starts.

    `wait_exec_ready` then polls until the exec channel is usable for
    the provisioning calls that follow."""
    repair_steps = (
        "chown -R node:node /home/node",
        "chown root:root /tmp /var/tmp",
        "chmod 1777 /tmp /var/tmp",
    )
    _smolvm.machine_exec(
        plan.machine_name,
        ["sh", "-c", " && ".join(repair_steps)],
    )
    _smolvm.wait_exec_ready(plan.machine_name)
|
||||||
|
|
||||||
|
|
||||||
def _bundle_launch_spec(
|
def _bundle_launch_spec(
|
||||||
plan: SmolmachinesBottlePlan, network: str, loopback_ip: str,
|
plan: SmolmachinesBottlePlan, network: str, loopback_ip: str,
|
||||||
) -> _bundle.BundleLaunchSpec:
|
) -> _bundle.BundleLaunchSpec:
|
||||||
@@ -324,10 +320,9 @@ def _bundle_launch_spec(
|
|||||||
# is "agent-facing" gets its port published on the host
|
# is "agent-facing" gets its port published on the host
|
||||||
# loopback (see `_ensure_smolmachine`'s discovery loop) and the
|
# loopback (see `_discover_urls` for the port discovery) and the
|
||||||
# other stays bundle-internal. The bundle is NOT reachable by
|
# other stays bundle-internal. The bundle is NOT reachable by
|
||||||
# bridge IP from the smolvm guest, so the
|
# bridge IP from the smolvm guest on macOS — TSI uses macOS
|
||||||
# PRD-0023-chunk-3 EGRESS_LISTEN_HOST=127.0.0.1 mitigation
|
# networking, and macOS sees the daemon's bridge via the
|
||||||
# isn't needed: the agent can only dial whatever daemon's
|
# published-port loopback forward only.
|
||||||
# host port we publish, period.
|
|
||||||
|
|
||||||
# --- pipelock ---------------------------------------------
|
# --- pipelock ---------------------------------------------
|
||||||
pp = plan.proxy_plan
|
pp = plan.proxy_plan
|
||||||
|
|||||||
@@ -45,6 +45,7 @@ alias gets handed to a new bottle."""
|
|||||||
|
|
||||||
from __future__ import annotations
|
from __future__ import annotations
|
||||||
|
|
||||||
|
import fcntl
|
||||||
import json
|
import json
|
||||||
import os
|
import os
|
||||||
import platform
|
import platform
|
||||||
@@ -83,6 +84,14 @@ _POOL_START = 16
|
|||||||
_POOL_END = 31 # inclusive
|
_POOL_END = 31 # inclusive
|
||||||
|
|
||||||
|
|
||||||
|
# File lock that serialises concurrent allocate() calls so two
|
||||||
|
# simultaneous launches can't read the same docker state and claim
|
||||||
|
# the same alias. Narrowed to the allocate() call itself; docker run
|
||||||
|
# runs after the lock is released. Once the container is running it
|
||||||
|
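# appears in docker state and future allocate() calls will see it.
# NOTE(review): an allocate() that runs after this lock is released but
# before the first caller's container is running will not see that alias
# in docker state yet, so that window still looks racy — confirm.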
|
||||||
|
_ALLOC_LOCK_PATH = Path.home() / ".cache" / "bot-bottle" / "smolmachines.lock"
|
||||||
|
|
||||||
|
|
||||||
# Loopback aliases pool: 127.0.0.<start>..127.0.0.<end>.
|
# Loopback aliases pool: 127.0.0.<start>..127.0.0.<end>.
|
||||||
def _pool_addresses() -> list[str]:
|
def _pool_addresses() -> list[str]:
|
||||||
return [f"127.0.0.{i}" for i in range(_POOL_START, _POOL_END + 1)]
|
return [f"127.0.0.{i}" for i in range(_POOL_START, _POOL_END + 1)]
|
||||||
@@ -179,9 +188,20 @@ def allocate(slug: str) -> str:
|
|||||||
On non-macOS the whole `127.0.0.0/8` is loopback by default;
|
On non-macOS the whole `127.0.0.0/8` is loopback by default;
|
||||||
`127.0.0.1` is fine to share and we skip the alias dance.
|
`127.0.0.1` is fine to share and we skip the alias dance.
|
||||||
This still returns a deterministic address so launch.py's
|
This still returns a deterministic address so launch.py's
|
||||||
callers don't have to branch on platform."""
|
callers don't have to branch on platform.
|
||||||
|
|
||||||
|
An exclusive file lock serialises concurrent calls so two
|
||||||
|
simultaneous launches don't read the same docker state and
|
||||||
|
claim the same alias."""
|
||||||
if not _is_macos():
|
if not _is_macos():
|
||||||
return "127.0.0.1"
|
return "127.0.0.1"
|
||||||
|
_ALLOC_LOCK_PATH.parent.mkdir(parents=True, exist_ok=True)
|
||||||
|
with open(_ALLOC_LOCK_PATH, "w") as lf:
|
||||||
|
fcntl.flock(lf, fcntl.LOCK_EX)
|
||||||
|
return _allocate_locked()
|
||||||
|
|
||||||
|
|
||||||
|
def _allocate_locked() -> str:
|
||||||
in_use = _aliases_in_use()
|
in_use = _aliases_in_use()
|
||||||
for ip in _pool_addresses():
|
for ip in _pool_addresses():
|
||||||
if ip not in in_use:
|
if ip not in in_use:
|
||||||
|
|||||||
@@ -27,10 +27,13 @@ from __future__ import annotations
|
|||||||
|
|
||||||
import shutil
|
import shutil
|
||||||
import subprocess
|
import subprocess
|
||||||
|
import time
|
||||||
from dataclasses import dataclass
|
from dataclasses import dataclass
|
||||||
from pathlib import Path
|
from pathlib import Path
|
||||||
from typing import Mapping, Sequence
|
from typing import Mapping, Sequence
|
||||||
|
|
||||||
|
from ...log import die
|
||||||
|
|
||||||
|
|
||||||
_SMOLVM = "smolvm"
|
_SMOLVM = "smolvm"
|
||||||
|
|
||||||
@@ -197,6 +200,30 @@ def machine_exec(
|
|||||||
)
|
)
|
||||||
|
|
||||||
|
|
||||||
|
def wait_exec_ready(name: str, *, timeout: float = 5.0) -> None:
    """Poll `machine exec true` until it exits 0 or `timeout` elapses.

    Replaces a fixed sleep after `machine_start`: the exec channel
    needs a brief warm-up before back-to-back exec calls are safe.
    Polling returns as soon as the channel answers and fails loudly
    if it never does.

    Args:
        name: smolvm machine name to probe.
        timeout: Budget in seconds for the whole wait. The machine is
            always probed at least once, even when `timeout` is zero
            or negative. The budget is only checked between probes,
            so a single slow probe can overrun it.

    Returns:
        None once a probe exits 0. On timeout `die()` is called with
        a message naming the machine, the number of probes made, and
        the last exit code."""
    deadline = time.monotonic() + timeout
    delay = 0.1
    attempts = 0
    while True:
        # Probe first, check the clock second: a tiny or already-spent
        # budget must not report "not ready" without ever asking the VM.
        result = machine_exec(name, ["true"])
        attempts += 1
        if result.returncode == 0:
            return
        remaining = deadline - time.monotonic()
        if remaining <= 0:
            break
        # Exponential backoff capped at 0.5s, never sleeping past the
        # deadline.
        time.sleep(min(delay, remaining))
        delay = min(delay * 2, 0.5)
    die(
        f"smolvm machine {name!r}: exec channel not ready after "
        f"{timeout:g}s ({attempts} attempt(s), last exit code "
        f"{result.returncode}) — VM may have failed to boot."
    )
|
||||||
|
|
||||||
|
|
||||||
def machine_cp(src: str, dst: str) -> None:
|
def machine_cp(src: str, dst: str) -> None:
|
||||||
"""`smolvm machine cp SRC DST`. Path syntax: `machine:path` to
|
"""`smolvm machine cp SRC DST`. Path syntax: `machine:path` to
|
||||||
reference a path inside the VM, bare path for the host. Both
|
reference a path inside the VM, bare path for the host. Both
|
||||||
|
|||||||
@@ -11,6 +11,7 @@ import json
|
|||||||
import sqlite3
|
import sqlite3
|
||||||
import subprocess
|
import subprocess
|
||||||
import tempfile
|
import tempfile
|
||||||
|
import threading
|
||||||
import unittest
|
import unittest
|
||||||
from pathlib import Path
|
from pathlib import Path
|
||||||
from unittest.mock import patch
|
from unittest.mock import patch
|
||||||
@@ -144,6 +145,55 @@ class TestAllocate(unittest.TestCase):
|
|||||||
loopback_alias.allocate("demo-overflow")
|
loopback_alias.allocate("demo-overflow")
|
||||||
|
|
||||||
|
|
||||||
|
class TestAllocateLock(unittest.TestCase):
    """allocate() on macOS takes an exclusive file lock so concurrent
    calls serialise rather than racing on docker state."""

    def test_acquires_exclusive_lock_on_macos(self):
        import fcntl as fcntl_mod
        flock_calls: list[int] = []

        def record_flock(fd, op):
            flock_calls.append(op)

        with tempfile.TemporaryDirectory() as tmp:
            lock_path = Path(tmp) / "smolmachines.lock"
            with patch.object(loopback_alias, "_is_macos", return_value=True), \
                 patch.object(loopback_alias, "_ALLOC_LOCK_PATH", lock_path), \
                 patch.object(loopback_alias, "_aliases_in_use", return_value=set()), \
                 patch.object(loopback_alias.fcntl, "flock",
                              side_effect=record_flock):
                loopback_alias.allocate("demo")

        self.assertIn(fcntl_mod.LOCK_EX, flock_calls)

    def test_no_lock_on_linux(self):
        # Linux early-returns before touching the lock file.
        with patch.object(loopback_alias, "_is_macos", return_value=False), \
             patch.object(loopback_alias.fcntl, "flock") as flock:
            loopback_alias.allocate("demo")
        flock.assert_not_called()

    def test_sequential_allocations_with_shared_lock_are_serialised(self):
        # Two sequential calls share the same lock file. The second
        # call sees {127.0.0.16} in use (as if the first caller's
        # docker run completed between the two lock acquisitions) and
        # returns the next alias. This only proves the lock file is
        # reusable; real contention is covered by the threaded test
        # below.
        in_use_seq = [set(), {"127.0.0.16"}]

        with tempfile.TemporaryDirectory() as tmp:
            lock_path = Path(tmp) / "smolmachines.lock"
            results: list[str] = []
            for _ in range(2):
                with patch.object(loopback_alias, "_is_macos", return_value=True), \
                     patch.object(loopback_alias, "_ALLOC_LOCK_PATH", lock_path), \
                     patch.object(loopback_alias, "_aliases_in_use",
                                  return_value=in_use_seq.pop(0)):
                    results.append(loopback_alias.allocate("demo"))

        self.assertEqual(["127.0.0.16", "127.0.0.17"], results)

    def test_concurrent_allocations_do_not_overlap(self):
        # Two threads enter allocate() at the same moment with the
        # real flock in place. The docker-state read is replaced by a
        # slow stub that counts how many callers are inside it at
        # once; with the lock held around it that count never
        # exceeds 1.
        counter_guard = threading.Lock()
        inside = 0
        peak = 0

        def slow_in_use():
            nonlocal inside, peak
            with counter_guard:
                inside += 1
                peak = max(peak, inside)
            # Hold the critical section long enough that an unlocked
            # second caller would overlap with this one.
            threading.Event().wait(0.05)
            with counter_guard:
                inside -= 1
            return set()

        start = threading.Barrier(2)
        results: list[str] = []
        errors: list[BaseException] = []

        def worker():
            try:
                start.wait(timeout=5)
                results.append(loopback_alias.allocate("demo"))
            except BaseException as exc:  # surfaced via the assert below
                errors.append(exc)

        with tempfile.TemporaryDirectory() as tmp:
            lock_path = Path(tmp) / "smolmachines.lock"
            with patch.object(loopback_alias, "_is_macos", return_value=True), \
                 patch.object(loopback_alias, "_ALLOC_LOCK_PATH", lock_path), \
                 patch.object(loopback_alias, "_aliases_in_use",
                              side_effect=slow_in_use):
                threads = [threading.Thread(target=worker) for _ in range(2)]
                for t in threads:
                    t.start()
                for t in threads:
                    t.join(timeout=5)

        self.assertEqual([], errors)
        self.assertFalse(any(t.is_alive() for t in threads))
        self.assertEqual(2, len(results))
        self.assertEqual(1, peak)
|
||||||
|
|
||||||
|
|
||||||
class TestAliasInUseDetection(unittest.TestCase):
|
class TestAliasInUseDetection(unittest.TestCase):
|
||||||
"""`_aliases_in_use` inspects every running bundle and pulls
|
"""`_aliases_in_use` inspects every running bundle and pulls
|
||||||
each container's port-binding `HostIp` out. The detection has
|
each container's port-binding `HostIp` out. The detection has
|
||||||
|
|||||||
@@ -12,6 +12,7 @@ import unittest
|
|||||||
from pathlib import Path
|
from pathlib import Path
|
||||||
from unittest.mock import patch
|
from unittest.mock import patch
|
||||||
|
|
||||||
|
from bot_bottle.backend.smolmachines import smolvm as smolvm_mod
|
||||||
from bot_bottle.backend.smolmachines.smolvm import (
|
from bot_bottle.backend.smolmachines.smolvm import (
|
||||||
SmolvmError,
|
SmolvmError,
|
||||||
SmolvmRunResult,
|
SmolvmRunResult,
|
||||||
@@ -23,6 +24,7 @@ from bot_bottle.backend.smolmachines.smolvm import (
|
|||||||
machine_start,
|
machine_start,
|
||||||
machine_stop,
|
machine_stop,
|
||||||
pack_create,
|
pack_create,
|
||||||
|
wait_exec_ready,
|
||||||
)
|
)
|
||||||
|
|
||||||
|
|
||||||
@@ -204,6 +206,45 @@ class TestErrorPath(unittest.TestCase):
|
|||||||
self.assertEqual(SmolvmRunResult(42, "", "nope"), r)
|
self.assertEqual(SmolvmRunResult(42, "", "nope"), r)
|
||||||
|
|
||||||
|
|
||||||
|
class TestWaitExecReady(unittest.TestCase):
    """wait_exec_ready keeps calling machine_exec(name, ["true"])
    until it gets exit code 0, then returns. When the deadline passes
    first it reports through die()."""

    def test_returns_immediately_when_exec_succeeds_first_try(self):
        ready = SmolvmRunResult(0, "", "")
        with patch.object(smolvm_mod, "machine_exec", return_value=ready) as fake_exec:
            wait_exec_ready("vm-x")
        fake_exec.assert_called_once_with("vm-x", ["true"])

    def test_retries_on_nonzero_and_returns_on_success(self):
        # Two failed probes, then a successful one.
        outcomes = [SmolvmRunResult(1, "", "not ready") for _ in range(2)]
        outcomes.append(SmolvmRunResult(0, "", ""))
        with patch.object(smolvm_mod, "machine_exec", side_effect=outcomes) as fake_exec:
            with patch.object(smolvm_mod.time, "sleep"):
                wait_exec_ready("vm-x")
        self.assertEqual(3, fake_exec.call_count)

    def test_dies_on_timeout(self):
        # Every probe fails. The fake clock reads 0.0 twice and then
        # jumps to 10.0, past the 5s deadline, so the loop gives up.
        clock = [0.0, 0.0, 10.0]
        never_ready = SmolvmRunResult(1, "", "")
        with patch.object(smolvm_mod, "machine_exec", return_value=never_ready):
            with patch.object(smolvm_mod.time, "monotonic", side_effect=clock):
                with patch.object(smolvm_mod.time, "sleep"):
                    with patch.object(smolvm_mod, "die",
                                      side_effect=SystemExit("die")) as fake_die:
                        with self.assertRaises(SystemExit):
                            wait_exec_ready("vm-x", timeout=5.0)
        fake_die.assert_called_once()
        message = fake_die.call_args.args[0]
        self.assertIn("vm-x", message)
|
||||||
|
|
||||||
|
|
||||||
class TestIsAvailable(unittest.TestCase):
|
class TestIsAvailable(unittest.TestCase):
|
||||||
def test_true_when_on_path(self):
|
def test_true_when_on_path(self):
|
||||||
with patch(
|
with patch(
|
||||||
|
|||||||
Reference in New Issue
Block a user